def run_one_test_episode(self, env) -> dict:
    # Greedy rollout: epsilon stays at its minimum, so the agent mostly exploits.
    state = env.reset()
    p_actual, p_reference = state
    state = np.array(state)
    eps = self.min_eps
    action_index = self.agent.get_action(state, eps)
    action = self.actions[action_index]
    output = {'p_actual': [p_actual], 'p_reference': [p_reference], 'action': [action]}
    for i in range(self.max_n_steps):
        next_state, reward, done, _ = env.step(action)
        p_actual, p_reference = next_state
        output['p_actual'].append(p_actual)
        output['p_reference'].append(p_reference)
        output['action'].append(action)
        # Select the next action from the updated state.
        state = np.array(next_state)
        action_index = self.agent.get_action(state, eps)
        action = self.actions[action_index]
        if done:
            break
    # Track how closely the actual power follows the reference trajectory.
    mse = calc_mse(output['p_actual'], output['p_reference'])
    output['mse'] = mse
    return output
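# calc_mse is used throughout this section but not defined in it. The sketch below is an
# assumption about its behavior, inferred from the name and call sites: the element-wise
# mean squared error between the actual and reference power trajectories of an episode.
def calc_mse(p_actual, p_reference) -> float:
    """Mean squared error between two equally long power trajectories (assumed helper)."""
    actual = np.asarray(p_actual, dtype=float)
    reference = np.asarray(p_reference, dtype=float)
    return float(np.mean((actual - reference) ** 2))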
def run_one_train_episode(self, env) -> dict:
    state = env.reset()
    p_actual, p_reference = state
    state = np.array(state)
    # Anneal epsilon over the course of training, down to min_eps.
    eps = epsilon_annealing(self.episode_counter, self.eps_decrease_last_episode, self.min_eps)
    action_index = self.agent.get_action(state, eps)
    action = self.actions[action_index]
    output = {'p_actual': [p_actual], 'p_reference': [p_reference], 'action': [action]}
    for i in range(self.max_n_steps):
        next_state, reward, done, _ = env.step(action)
        p_actual, p_reference = next_state
        output['p_actual'].append(p_actual)
        output['p_reference'].append(p_reference)
        output['action'].append(action)
        next_state = np.array(next_state)
        # Store the transition and train on a minibatch once enough samples are available.
        self.replay_memory.push(state, action_index, reward, next_state, done)
        if len(self.replay_memory) > self.batch_size:
            minibatch = self.replay_memory.pop(self.batch_size)
            train_helper(self.agent, minibatch, self.gamma)
        state = next_state
        action_index = self.agent.get_action(state, eps)
        action = self.actions[action_index]
        if done:
            break
    mse = calc_mse(output['p_actual'], output['p_reference'])
    output['mse'] = mse
    self.episode_counter += 1
    return output
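# Neither epsilon_annealing nor the replay memory is defined in this section. The sketches
# below are assumptions based only on how they are called above: a linear decay of epsilon
# towards min_eps over the first eps_decrease_last_episode episodes, and a bounded buffer
# whose push/pop/len match the call sites in run_one_train_episode.
import random
from collections import deque


def epsilon_annealing(episode, eps_decrease_last_episode, min_eps) -> float:
    """Linearly anneal epsilon from 1.0 down to min_eps (assumed schedule)."""
    slope = (min_eps - 1.0) / eps_decrease_last_episode
    return max(min_eps, slope * episode + 1.0)


class ReplayMemory:
    """Fixed-size transition buffer (assumed implementation)."""

    def __init__(self, capacity: int):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action_index, reward, next_state, done):
        self.buffer.append((state, action_index, reward, next_state, done))

    def pop(self, batch_size: int):
        # Uniform random sampling; the actual sampling strategy is an assumption.
        return random.sample(list(self.buffer), batch_size)

    def __len__(self) -> int:
        return len(self.buffer)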
def run_one_test_episode(self, env) -> dict:
    state = env.reset()
    delta = 0
    p_actual, p_reference = state
    # The agent also sees the change in actual power between consecutive steps.
    action = self.agent.use([*state, delta])
    output = {
        'p_actual': [p_actual],
        'p_reference': [p_reference],
        'action': [action]
    }
    for i in range(self.max_n_steps):
        next_state, reward, done, _ = env.step(action)
        next_delta = next_state[0] - state[0]
        p_actual, p_reference = next_state
        output['p_actual'].append(p_actual)
        output['p_reference'].append(p_reference)
        output['action'].append(action)
        action = self.agent.use([*next_state, next_delta])
        state = next_state
        if done:
            break
    mse = calc_mse(output['p_actual'], output['p_reference'])
    output['mse'] = mse
    return output
def run_one_train_episode(self, env) -> dict:
    # Baseline episode: apply the constant action self.k at every step.
    state = env.reset()
    p_actual, p_reference = state
    output = {
        'p_actual': [p_actual],
        'p_reference': [p_reference],
        'action': [self.k]
    }
    for i in range(self.max_steps):
        obs, rew, done, _ = env.step(self.k)
        p_actual, p_reference = obs
        output['p_actual'].append(p_actual)
        output['p_reference'].append(p_reference)
        output['action'].append(self.k)
        if done:
            break
    mse = calc_mse(output['p_actual'], output['p_reference'])
    output['mse'] = mse
    return output
def run_one_train_episode(self, env) -> dict:
    state = env.reset()
    p_actual, p_reference = state
    action = self.agent.use(state)
    output = {
        'p_actual': [p_actual],
        'p_reference': [p_reference],
        'action': [action]
    }
    for i in range(self.max_n_steps):
        next_state, reward, done, _ = env.step(action)
        p_actual, p_reference = next_state
        output['p_actual'].append(p_actual)
        output['p_reference'].append(p_reference)
        output['action'].append(action)
        # The agent updates itself on the observed transition and returns the next action.
        action = self.agent.learn(state, action, reward, next_state)
        state = next_state
        if done:
            break
    mse = calc_mse(output['p_actual'], output['p_reference'])
    output['mse'] = mse
    return output
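# Minimal usage sketch, not part of the original code: it assumes `trainer` is an instance
# of one of the controller classes these methods belong to and `env` is the Gym-style
# power-tracking environment whose observations are (p_actual, p_reference) pairs.
# The variable names below are hypothetical.
n_episodes = 200
mse_history = []
for _ in range(n_episodes):
    episode_log = trainer.run_one_train_episode(env)
    mse_history.append(episode_log['mse'])
print(f"MSE after {n_episodes} episodes: {mse_history[-1]:.4f}")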