    def run_one_test_episode(self, env) -> dict:
        state = env.reset()
        p_actual, p_reference = state

        state = np.array(state)
        # Evaluation episode: keep exploration fixed at the minimum epsilon.
        eps = self.min_eps
        action_index = self.agent.get_action(state, eps)
        action = self.actions[action_index]
        output = {'p_actual': [p_actual], 'p_reference': [p_reference], 'action': [action]}

        for i in range(self.max_n_steps):
            next_state, reward, done, _ = env.step(action)
            p_actual, p_reference = next_state
            output['p_actual'].append(p_actual)
            output['p_reference'].append(p_reference)
            output['action'].append(action)
            state = np.array(next_state)
            action_index = self.agent.get_action(state, eps)
            action = self.actions[action_index]

            if done:
                break

        mse = calc_mse(output['p_actual'], output['p_reference'])
        output['mse'] = mse
        return output
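
This runner selects actions through self.agent.get_action(state, eps), which is not shown in the snippet. A minimal sketch of an epsilon-greedy selection it could correspond to, assuming a PyTorch Q-network; the names model and n_actions are assumptions for illustration, not the original implementation:

import random

import numpy as np
import torch


def get_action(model: torch.nn.Module, state: np.ndarray, eps: float, n_actions: int) -> int:
    """Epsilon-greedy selection: explore with probability eps, otherwise
    take the argmax of the Q-values predicted by the network."""
    if random.random() < eps:
        return random.randrange(n_actions)
    with torch.no_grad():
        q_values = model(torch.from_numpy(state).float().unsqueeze(0))
    return int(q_values.argmax(dim=1).item())
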
    def run_one_train_episode(self, env) -> dict:
        state = env.reset()
        p_actual, p_reference = state
        state = np.array(state)
        # Anneal epsilon towards min_eps as the episode counter grows.
        eps = epsilon_annealing(self.episode_counter, self.eps_decrease_last_episode, self.min_eps)
        action_index = self.agent.get_action(state, eps)
        action = self.actions[action_index]
        output = {'p_actual': [p_actual], 'p_reference': [p_reference], 'action': [action]}

        for i in range(self.max_n_steps):
            next_state, reward, done, _ = env.step(action)
            p_actual, p_reference = next_state
            output['p_actual'].append(p_actual)
            output['p_reference'].append(p_reference)
            output['action'].append(action)
            next_state = np.array(next_state)
            self.replay_memory.push(state, action_index, reward, next_state, done)

            # Train on a random minibatch once enough transitions are stored.
            if len(self.replay_memory) > self.batch_size:
                minibatch = self.replay_memory.pop(self.batch_size)
                train_helper(self.agent, minibatch, self.gamma)

            state = next_state
            action_index = self.agent.get_action(state, eps)
            action = self.actions[action_index]

            if done:
                break

        mse = calc_mse(output['p_actual'], output['p_reference'])
        output['mse'] = mse
        self.episode_counter += 1
        return output
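
The train episode depends on epsilon_annealing and train_helper, which are defined elsewhere in the project. A rough sketch of plausible implementations, assuming a linear epsilon decay and a standard one-step DQN update; the attributes agent.model, agent.optimizer and agent.loss_fn are assumptions, not names taken from the source:

import numpy as np
import torch


def epsilon_annealing(episode: int, last_decay_episode: int, min_eps: float) -> float:
    """Linearly decay epsilon from 1.0 to min_eps over the first
    last_decay_episode episodes, then hold it constant."""
    slope = (min_eps - 1.0) / last_decay_episode
    return max(min_eps, 1.0 + slope * episode)


def train_helper(agent, minibatch, gamma: float) -> None:
    """One gradient step on a minibatch of (state, action, reward, next_state, done)."""
    states, actions, rewards, next_states, dones = map(np.array, zip(*minibatch))

    states_t = torch.from_numpy(states).float()
    next_states_t = torch.from_numpy(next_states).float()
    actions_t = torch.from_numpy(actions).long().unsqueeze(1)
    rewards_t = torch.from_numpy(rewards).float()
    dones_t = torch.from_numpy(dones.astype(np.float32))

    # Q(s, a) for the actions actually taken.
    q_values = agent.model(states_t).gather(1, actions_t).squeeze(1)
    # One-step TD target: r + gamma * max_a' Q(s', a'), zeroed at terminal states.
    with torch.no_grad():
        next_q = agent.model(next_states_t).max(dim=1).values
    targets = rewards_t + gamma * next_q * (1.0 - dones_t)

    loss = agent.loss_fn(q_values, targets)
    agent.optimizer.zero_grad()
    loss.backward()
    agent.optimizer.step()
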
Example 3
    def run_one_test_episode(self, env) -> dict:
        state = env.reset()
        # delta tracks the step-to-step change of p_actual and is appended
        # to the observation before querying the agent.
        delta = 0
        p_actual, p_reference = state
        action = self.agent.use([*state, delta])
        output = {
            'p_actual': [p_actual],
            'p_reference': [p_reference],
            'action': [action]
        }

        for i in range(self.max_n_steps):
            next_state, reward, done, _ = env.step(action)
            next_delta = next_state[0] - state[0]
            p_actual, p_reference = next_state
            output['p_actual'].append(p_actual)
            output['p_reference'].append(p_reference)
            output['action'].append(action)

            action = self.agent.use([*next_state, next_delta])
            state = next_state
            if done:
                break
        mse = calc_mse(output['p_actual'], output['p_reference'])
        output['mse'] = mse
        return output
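
The dictionary returned by these runners is ready to be consumed for evaluation, for example by plotting the tracked signal against its reference and reporting the episode MSE. A small usage sketch; the helper name plot_episode and the matplotlib plotting choices are illustrative, not part of the original code:

import matplotlib.pyplot as plt


def plot_episode(output: dict) -> None:
    """Visualise one episode dict as returned by run_one_test_episode."""
    plt.plot(output['p_actual'], label='p_actual')
    plt.plot(output['p_reference'], label='p_reference')
    plt.xlabel('step')
    plt.ylabel('power')
    plt.legend()
    plt.title(f"episode MSE = {output['mse']:.4f}")
    plt.show()
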
Example 4
    def run_one_train_episode(self, env) -> dict:
        state = env.reset()
        p_actual, p_reference = state
        output = {
            'p_actual': [p_actual],
            'p_reference': [p_reference],
            'action': [self.k]
        }

        for i in range(self.max_steps):
            # Constant baseline: apply the fixed action self.k at every step.
            obs, rew, done, _ = env.step(self.k)
            p_actual, p_reference = obs
            output['p_actual'].append(p_actual)
            output['p_reference'].append(p_reference)
            output['action'].append(self.k)
            if done:
                break
        mse = calc_mse(output['p_actual'], output['p_reference'])
        output['mse'] = mse
        return output
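
Every example scores an episode with calc_mse, whose definition is not shown. A minimal sketch of the obvious implementation, the mean squared error between the actual and the reference trajectory; the body below is an assumption:

import numpy as np


def calc_mse(p_actual, p_reference) -> float:
    """Mean squared error between the actual and the reference trajectory."""
    p_actual = np.asarray(p_actual, dtype=float)
    p_reference = np.asarray(p_reference, dtype=float)
    return float(np.mean((p_actual - p_reference) ** 2))
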
Example 5
    def run_one_train_episode(self, env) -> dict:
        state = env.reset()
        p_actual, p_reference = state
        action = self.agent.use(state)
        output = {
            'p_actual': [p_actual],
            'p_reference': [p_reference],
            'action': [action]
        }

        for i in range(self.max_n_steps):
            next_state, reward, done, _ = env.step(action)
            p_actual, p_reference = next_state
            output['p_actual'].append(p_actual)
            output['p_reference'].append(p_reference)
            output['action'].append(action)

            # The agent updates online and returns the action for the next step.
            action = self.agent.learn(state, action, reward, next_state)
            state = next_state
            if done:
                break
        mse = calc_mse(output['p_actual'], output['p_reference'])
        output['mse'] = mse
        return output
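
In this last example agent.learn(...) both updates the agent and returns the next action, while use(state) acts greedily. The agent itself is not shown; below is a rough sketch of an agent matching that interface, using a linear Q-function over a discrete action set. The class name and all internals are purely illustrative, only the use/learn contract is taken from the code above:

import numpy as np


class LinearQAgent:
    """Q(s, a) is linear in the state, one weight vector per discrete action;
    learn() performs a one-step TD update and returns the next action."""

    def __init__(self, actions, state_dim, alpha=0.01, gamma=0.99, eps=0.1):
        self.actions = list(actions)
        self.weights = np.zeros((len(self.actions), state_dim))
        self.alpha, self.gamma, self.eps = alpha, gamma, eps

    def _q(self, state):
        return self.weights @ np.asarray(state, dtype=float)

    def use(self, state):
        """Greedy action for evaluation."""
        return self.actions[int(np.argmax(self._q(state)))]

    def learn(self, state, action, reward, next_state):
        """One-step TD update, then an epsilon-greedy action for the next step."""
        a_idx = self.actions.index(action)
        target = reward + self.gamma * np.max(self._q(next_state))
        td_error = target - self._q(state)[a_idx]
        self.weights[a_idx] += self.alpha * td_error * np.asarray(state, dtype=float)

        if np.random.rand() < self.eps:
            return self.actions[np.random.randint(len(self.actions))]
        return self.actions[int(np.argmax(self._q(next_state)))]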