    def train_model(self, sars, done):
        (state, action, reward, next_state) = sars

        state = u.t_float32(state)
        action = u.t_float32(action)
        reward = u.t_float32(reward)
        next_state = u.t_float32(next_state)

        value = self.critic(state)
        next_value = self.critic(next_state)

        if done:
            advantage = reward - value
            target = reward
        else:
            advantage = (reward + self.discount_factor * next_value) - value
            target = reward + self.discount_factor * next_value

        self.actor_optimizer.zero_grad()
        probs = self.actor(state)
        # Detach the advantage so the actor step does not backpropagate into the critic
        actor_loss = -Categorical(probs).log_prob(action) * advantage.detach()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss = torch.mean((target.detach() - self.critic(state))**2)
        critic_loss.backward()
        self.critic_optimizer.step()

        if done:
            TrainerMetadata().log(critic_loss, 'critic_loss')
            TrainerMetadata().log(actor_loss, 'actor_loss')
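
All of the snippets on this page funnel raw values through a small utility module u before they reach the networks. That module is not shown here; the following is only a minimal sketch of what t_float32, t_uint8, and t_from_np_to_float32 could look like, with the device handling assumed:

import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def t_float32(value):
    # Wrap scalars, lists, or NumPy arrays as a float32 tensor on the active device
    return torch.as_tensor(np.asarray(value), dtype=torch.float32, device=device)

def t_uint8(value):
    # Used for done flags; stored as uint8 so it can act as a mask
    return torch.as_tensor(np.asarray(value), dtype=torch.uint8, device=device)

def t_from_np_to_float32(array):
    # Same idea as t_float32, but explicit about expecting a NumPy array
    return torch.from_numpy(np.asarray(array, dtype=np.float32)).to(device)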
Example #2
 def append_sample(self, sars, done):
     state, action, reward, next_state = sars
     state, action, reward = u.t_float32(state), u.t_float32(
         action), u.t_float32(reward)
     # FIXME: This could probably use t_uint8 as well, but the GAE part errors out when multiplying a float value by a Byte tensor
     done = u.t_float32(done)
     transition = self.transition_structure(state, action, reward, done)
     self.memory.append(transition)
 def append_sample(self, sars, done):
     state, action, reward, next_state = sars
     self.memory.push(
         u.t_float32(state),
         u.t_float32(action),
         u.t_float32(reward),
         u.t_float32(next_state),
         u.t_uint8(done)
     )
 def critic_update(self, state, target):
     self.critic_optimizer.zero_grad()
     target = u.t_float32(target)
     loss = torch.mean((target - self.critic(state)) ** 2)
     loss.backward()
     self.critic_optimizer.step()
     return loss
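
The train_model variant further down calls self.actor_update(state, action, advantage) as the counterpart to critic_update, but that method is not reproduced on this page. A sketch that mirrors the inline actor step from the first example (negative log-probability of the taken action times the detached advantage; Categorical comes from torch.distributions) might look like this:

from torch.distributions import Categorical

def actor_update(self, state, action, advantage):
    # Policy-gradient step: increase the log-probability of the taken action
    # in proportion to the (detached) advantage estimate
    self.actor_optimizer.zero_grad()
    probs = self.actor(state)
    loss = -Categorical(probs).log_prob(action) * advantage.detach()
    loss.backward()
    self.actor_optimizer.step()
    return loss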
Example #5
    def intrinsic_motivation_impl(self, i_episode, step, current_sars,
                                  current_done):
        # Random motivation
        current_state, current_action, current_reward, current_next_state = current_sars

        intrinsic_reward = torch.rand_like(u.t_float32(current_reward))
        intrinsic_reward = (self.a * intrinsic_reward) + self.b

        return intrinsic_reward
    def intrinsic_motivation_impl(self, i_episode, step, current_sars,
                                  current_done):
        # Predictive novelty motivation (NM)
        current_state, current_action, current_reward, current_next_state = current_sars

        state_prediction_error = self._train_model(
            u.t_float32(current_state), u.t_float32(current_action),
            u.t_float32(current_next_state))
        intrinsic_reward = self.intrinsic_scale_1 * state_prediction_error

        # TODO: Should this be clipped to around the environment's usual reward (about 1)?
        # intrinsic_reward = torch.clamp(intrinsic_reward, min=-2, max=2)
        # TrainerMetadata().log('intrinsic_reward', torch.mean(intrinsic_reward))

        # TODO: Only start returning the intrinsic reward after the Expert network has had some initial training?
        # if self.delayed_start and (TrainerMetadata().global_step < i_episode + self.intrinsic_reward_start):
        #    return 0

        return intrinsic_reward
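
The novelty (and surprise) variants rely on a self._train_model(state, action, next_state) helper that fits a forward model and returns its prediction error; it is not included on this page. Below is only a rough sketch of the single-error version, where self.predictor and self.predictor_optimizer are assumed names and the state/action tensors are assumed to be 1-D:

def _train_model(self, state, action, next_state):
    # Forward model: predict the next state from the current state and action
    predicted_next_state = self.predictor(torch.cat([state, action], dim=-1))

    self.predictor_optimizer.zero_grad()
    loss = torch.mean((next_state - predicted_next_state) ** 2)
    loss.backward()
    self.predictor_optimizer.step()

    # The detached prediction error doubles as the novelty signal
    return loss.detach()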
Example #7
    def intrinsic_motivation_impl(self, i_episode, step, current_sars,
                                  current_done):
        # Learning progress motivation (LPM)
        current_state, current_action, current_reward, current_next_state = current_sars

        examplar = ExemplarStructure(u.t_float32(current_state),
                                     u.t_float32(current_action),
                                     u.t_float32(current_next_state))
        self.region_manager.add(examplar)

        region = self.region_manager.find_region(examplar)
        past_error = region.get_past_error_mean()
        current_error = region.get_current_error_mean()
        intrinsic_reward = past_error - current_error

        # TODO: Should this be clipped to around the environment's usual reward (about 1)?
        # intrinsic_reward_batch = torch.clamp(intrinsic_reward_batch, min=-2, max=2)
        # self.viz.draw_line(y=torch.mean(intrinsic_reward_batch), interval=1000, name="intrinsic_reward_batch")

        return intrinsic_reward
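
The LPM and FM variants assume a region_manager that buckets exemplars and tracks the forward model's error per region, plus an ExemplarStructure record; neither is defined on this page. An interface-level stub, inferred only from the calls above, would be:

from collections import namedtuple

# Field names inferred from the constructor calls above
ExemplarStructure = namedtuple('ExemplarStructure', ['state', 'action', 'next_state'])

class Region:
    def get_past_error_mean(self):
        # Mean prediction error over an older window of exemplars in this region
        raise NotImplementedError

    def get_current_error_mean(self):
        # Mean prediction error over the most recent window
        raise NotImplementedError

class RegionManager:
    def add(self, exemplar):
        # Store the exemplar and update the region it falls into
        raise NotImplementedError

    def find_region(self, exemplar):
        # Return the Region whose split covers this exemplar
        raise NotImplementedError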
    def intrinsic_motivation_impl(self, i_episode, step, current_sars,
                                  current_done):
        # Predictive Surprise Motivation (SM)
        current_state, current_action, current_reward, current_next_state = current_sars

        state_prediction_error, meta_prediction_error = self._train_model(
            u.t_float32(current_state), u.t_float32(current_action),
            u.t_float32(current_next_state))
        intrinsic_reward = self.intrinsic_scale_1 * (state_prediction_error /
                                                     meta_prediction_error)

        # TODO: Should this be clipped to around the environment's usual reward (about 1)?
        # intrinsic_reward = torch.clamp(intrinsic_reward, min=-2, max=2)

        # TODO: Only start returning the intrinsic reward after the Expert network has had some initial training?
        # if self.delayed_start and (TrainerMetadata().global_step < i_episode + self.intrinsic_reward_start):
        #    return 0

        # TODO: I threw in a tanh on a whim :(
        # intrinsic_reward = torch.tanh(intrinsic_reward).item()

        return intrinsic_reward
    def intrinsic_motivation_impl(self, i_episode, step, current_sars, current_done):
        # Predictive familiarity motivation (FM)
        current_state, current_action, current_reward, current_next_state = current_sars

        examplar = ExemplarStructure(
            u.t_float32(current_state),
            u.t_float32(current_action),
            u.t_float32(current_next_state)
        )
        self.region_manager.add(examplar)

        region = self.region_manager.find_region(examplar)
        current_error = region.get_current_error_mean()
        intrinsic_reward = self.intrinsic_scale_1 / current_error

        intrinsic_reward = u.t_float32(intrinsic_reward)
        # TrainerMetadata().log(value=intrinsic_reward, indicator='intrinsic_reward',
        # variable='raw', interval=1, show_only_last=False, compute_maxmin=False)
        intrinsic_reward = torch.clamp(intrinsic_reward, min=-2, max=2)
        # TrainerMetadata().log(value=intrinsic_reward, indicator='intrinsic_reward',
        # variable='clamp', interval=1, show_only_last=False, compute_maxmin=False)

        intrinsic_reward = intrinsic_reward.item()
        return intrinsic_reward
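
None of the intrinsic_motivation_impl variants show how their return value is consumed. One common pattern, purely an assumption here and not taken from these snippets, is to add the intrinsic reward to the environment reward before the transition is stored (agent.algorithm_im is a hypothetical attribute):

# Hypothetical glue inside the step loop
intrinsic_reward = agent.algorithm_im.intrinsic_motivation_impl(i_episode, t, sars, done)
total_reward = reward + intrinsic_reward
sars = (state, action, total_reward, next_state)
agent.algorithm_rl.append_sample(sars, done)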
    def train_model(self, state, action, reward, next_state, done):
        state = u.t_from_np_to_float32(state)
        action = u.t_float32(action)
        next_state = u.t_from_np_to_float32(next_state)
        value = self.critic(state)
        next_value = self.critic(next_state)

        if done:
            advantage = reward - value
            target = reward
        else:
            advantage = (reward + self.discount_factor * next_value) - value
            target = reward + self.discount_factor * next_value

        actor_loss = self.actor_update(state, action, advantage)
        critic_loss = self.critic_update(state, target)

        return actor_loss, critic_loss
Example #11
    def train_model(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        # Sample a fixed-size batch of transitions from memory,
        # then regroup them by field (i.e., transpose the batch).
        transitions = self.memory.sample(self.batch_size)
        batch = self.transition_structure(*zip(*transitions))

        target = []
        target_val = []

        # Run each stored state through the policy network to get the per-action values,
        # then pick out the value of the action that was actually taken.
        for i in range(self.batch_size):
            state = batch.state[i]
            action = batch.action[i]
            target.append(self.policy_model(state).squeeze()[action])

        # Estimate the target network's values for the next states (used when not done)
        for i in range(self.batch_size):
            next_state = batch.next_state[i]
            target_val.append(self.target_model(next_state))

        # Add the discounted Q estimate to the reward only when the episode did not end
        for i in range(self.batch_size):
            done = batch.done[i]
            reward = batch.reward[i]
            if done:
                target_val[i] = u.t_float32(reward).squeeze()
            else:
                target_val[i] = reward + self.discount_factor * torch.max(
                    target_val[i]).to(device)

        # Compare the policy network's predictions against the target values with MSE
        self.policy_optimizer.zero_grad()
        criterion = nn.MSELoss().to(device)
        loss = criterion(torch.stack(target), torch.stack(target_val))
        loss.backward()
        self.policy_optimizer.step()

        # Returned for plotting
        return loss
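
The DQN update above assumes self.memory exposes sample(batch_size), and the append_sample snippets push whole transitions into it; the buffer class itself is not shown on this page. A minimal deque-backed sketch, with the transition structure as the usual namedtuple (field names matching the push calls above, implementation assumed):

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayMemory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *fields):
        # Store one transition, evicting the oldest once capacity is reached
        self.buffer.append(Transition(*fields))

    def sample(self, batch_size):
        # Uniform random minibatch
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)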
Example #12
                            console_log_order=[
                                'Epoch',
                                'Score',
                                'Time',
                            ])

    if IS_LOAD:
        TrainerMetadata().load()

    # Run for up to the maximum number of episodes
    for i_episode in range(TrainerMetadata().current_epoch, EPISODES):
        TrainerMetadata().start_episode()

        agent.algorithm_rl.reset()
        state = env.reset()
        score = u.t_float32(0)

        # Run each episode for up to the environment's maximum step count,
        # stopping early if the environment reports a terminal state (done).
        for t in range(env.spec.max_episode_steps):
            TrainerMetadata().start_step()

            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)

            sars = (state, action, reward, next_state)
            agent.algorithm_rl.append_sample(sars, done)

            if len(agent.algorithm_rl.memory) >= agent.algorithm_rl.train_start:
                agent.train_model(i_episode, t, sars, done)
Example #13
 def append_sample(self, state, action, reward, next_state, done):
     self.memory.push(u.t_float32(state), action, reward,
                      u.t_float32(next_state), u.t_uint8(done))
Example #14
    checkpoint_inst = Checkpoint(VERSION, IS_SAVE, SAVE_INTERVAL)
    """
    State space: 4 values, range -∞ < s < ∞
    Action space: 1 value, discrete 0 or 1
    """
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)

    # Run for up to the maximum number of episodes
    for episode in range(metadata.current_epoch, EPISODES):
        start_time = time.time()
        state = env.reset()
        score = last_loss = u.t_float32(0)

        seongkwageup = 0  # running total of the shaped bonus reward (seokyong_reward)
        # Run each episode for up to the environment's maximum step count,
        # stopping early if the environment reports a terminal state (done).
        for t in range(env.spec.max_episode_steps):
            action = agent.get_action(state)
            next_state, reward, seokyong_reward, done, _ = god_seokyong_reward_method_step(
                action, t)
            seongkwageup += seokyong_reward
            # next_state, reward, seokyong_reward, done, _ = env.step(action)
            reward = reward if not done or score == 499 + seongkwageup else -400
            # reward = reward if not done or score == 499 else -100

            agent.append_sample(state, action, reward, next_state, done)