Example #1
def test_add_adapted_experience_for_an_episode_long_action_length():
    """Tests that add_adapted_experience_for_an_episode works correctly for actions with length > 2"""
    buffer_size = 4
    memory_shaper = Memory_Shaper(
        buffer_size, buffer_size, seed, new_reward_fn=new_reward_fn)
    states = [0, 1, 2]
    next_states = [1, 10, 11]
    rewards = [10, 5, 2]
    actions = [0, 1, 2]
    dones = [False, False, False]
    memory_shaper.add_episode_experience(
        states, next_states, rewards, actions, dones)

    action_rules = {3: (0, 1, 2), 0: (0,), 1: (1,), 2: (2, )}

    replay_buffer = memory_shaper.put_adapted_experiences_in_a_replay_buffer(
        action_rules)

    assert len(replay_buffer.memories[0]) == 1
    assert len(replay_buffer.memories[1]) == 1
    assert len(replay_buffer.memories[2]) == 1
    assert len(replay_buffer.memories[3]) == 1

    s_states, s_actions, s_rewards, s_next_states, s_dones = replay_buffer.sample(
        separate_out_data_types=True)

    assert all(s_states.numpy() == np.array([[0.0], [1.0], [2.0], [0.0]]))
    assert all(s_actions.numpy() == np.array([[0.0], [1.0], [2.0], [3.0]]))
    assert np.allclose(s_rewards.numpy(),
                       np.array([[10.0], [5.0], [2.0], [new_reward_fn(17.0, 3)]]))
    assert all(s_next_states.numpy() == np.array([[1.0], [10.0], [11.0], [11.0]]))
    assert all(s_dones.numpy() == np.array([[0.0], [0.0], [0.0], [0.0]]))
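# The tests in this file reference module-level fixtures `seed` and `new_reward_fn`
# that are defined elsewhere in the test file. The definitions below are hypothetical
# stand-ins only; the shaping formula is an assumption, mirrored from
# DDQN_Wrapper.update_reward_to_encourage_longer_macro_actions further down.
example_seed = 1

def example_new_reward_fn(cumulative_reward, length_of_macro_action, action_length_reward_bonus=0.1):
    """Hypothetical reward-shaping function: boosts the reward in proportion to sqrt(length - 1)."""
    increment = abs(cumulative_reward) if cumulative_reward != 0.0 else 0.1
    return cumulative_reward + increment * ((length_of_macro_action - 1) ** 0.5) * action_length_reward_bonus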
Example #2
def test_calculate_max_action_length():
    """Tests that calculate_max_action_length works correctly"""

    memory_shaper = Memory_Shaper(
        buffer_size, batch_size, seed, new_reward_fn=new_reward_fn)
    action_rules = {(0, 2, 33, 1, 22, 0, 0): 99, (0, 4): 2, (0, 9): 100}
    assert memory_shaper.calculate_max_action_length(action_rules) == 7

    action_rules = {(0, 2, 3): 99, (0, 4, 0, 0): 2, (0, 9): 100}
    assert memory_shaper.calculate_max_action_length(action_rules) == 4
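# A minimal sketch of what calculate_max_action_length could look like, inferred only
# from the assertions above (the real Memory_Shaper implementation is not shown here):
# it returns the length of the longest macro-action tuple among the rule keys.
def example_calculate_max_action_length(action_rules):
    return max(len(macro_action) for macro_action in action_rules.keys())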
Example #3
def test_add_adapted_experience_for_multiple_episodes():
    """Tests that add_adapted_experience_for_an_episode works correctly for multiple episodes"""
    # for reward_increment in [0.0, 0.5, 1.5]:
    buffer_size = 6

    memory_shaper = Memory_Shaper(buffer_size, 6, seed, new_reward_fn)
    states = [0]
    next_states = [1]
    rewards = [10]
    actions = [0]
    dones = [False]
    memory_shaper.add_episode_experience(states, next_states, rewards, actions,
                                         dones)

    states = [1]
    next_states = [2]
    rewards = [11]
    actions = [1]
    dones = [True]
    memory_shaper.add_episode_experience(states, next_states, rewards, actions,
                                         dones)

    states = [1, 2]
    next_states = [2, 3]
    rewards = [11, 2]
    actions = [0, 1]
    dones = [False, True]
    memory_shaper.add_episode_experience(states, next_states, rewards, actions,
                                         dones)

    action_rules = {0: (0, ), 1: (1, ), 2: (0, 1)}

    replay_buffer = memory_shaper.put_adapted_experiences_in_a_replay_buffer(
        action_rules)

    assert len(replay_buffer.memories[0]) == 2
    assert len(replay_buffer.memories[1]) == 2
    assert len(replay_buffer.memories[2]) == 1

    s_states, s_actions, s_rewards, s_next_states, s_dones = replay_buffer.sample(
        separate_out_data_types=True)

    print(s_actions)
    print(s_rewards)
    print(s_dones)

    assert all(s_states.numpy() == np.array([[0.0], [1.0], [2.0], [1.0], [1.0],
                                             [1.0]]))
    assert all(
        s_actions.numpy() == np.array([[0.0], [0.0], [1.], [1.], [2.], [2.]]))
    assert np.allclose(
        s_rewards.numpy(),
        np.array([[10.0], [11.0], [2.0], [11.0], [new_reward_fn(13.0, 2)],
                  [new_reward_fn(13.0, 2)]]))
    assert all(s_next_states.numpy() == np.array([[1.0], [2.0], [3.0], [2.0],
                                                  [3.0], [3.0]]))
    assert all(
        s_dones.numpy() == np.array([[0.0], [0.0], [1.0], [1.], [1.], [1.]]))
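# The replay_buffer.memories[action_id] assertions above suggest an action-balanced
# replay buffer that keeps a separate sub-buffer per action id. A minimal sketch of
# that idea follows; the class name and sizing rule are assumptions, not the real
# replay buffer used by Memory_Shaper.
from collections import deque

class ExampleActionBalancedBuffer:
    def __init__(self, buffer_size, num_actions):
        per_action_size = max(1, buffer_size // num_actions)
        self.memories = {a: deque(maxlen=per_action_size) for a in range(num_actions)}

    def add_experience(self, state, action, reward, next_state, done):
        self.memories[action].append((state, action, reward, next_state, done))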
Example #5
def test_add_adapted_experience_for_an_episode():
    """Tests that add_adapted_experience_for_an_episode works correctly"""
    buffer_size = 3
    memory_shaper = Memory_Shaper(buffer_size,
                                  buffer_size,
                                  seed,
                                  new_reward_fn=new_reward_fn,
                                  action_balanced_replay_buffer=False)
    memory_shaper.reset()
    states = [0, 1]
    next_states = [1, 10]
    rewards = [10, 5]
    actions = [0, 5]
    dones = [False, True]
    memory_shaper.add_episode_experience(states, next_states, rewards, actions,
                                         dones)

    action_rules = {
        6: (0, 5),
        1: (1, ),
        2: (2, ),
        3: (3, ),
        4: (4, ),
        5: (5, ),
        0: (0, )
    }

    replay_buffer = memory_shaper.put_adapted_experiences_in_a_replay_buffer(
        action_rules)

    assert len(replay_buffer) == 3

    s_states, s_actions, s_rewards, s_next_states, s_dones = replay_buffer.sample(
        separate_out_data_types=True)

    assert all(s_states.numpy() == np.array([[0.0], [0.0], [1.0]]))
    assert all(s_actions.numpy() == np.array([[0.0], [6.0], [5.0]]))
    assert all(s_rewards.numpy() == np.array([[10.0], [new_reward_fn(15.0, 2)], [5.0]]))
    assert all(s_next_states.numpy() == np.array([[1.0], [10.0], [10.0]]))
    assert all(s_dones.numpy() == np.array([[0.0], [1.0], [1.0]]))

    buffer_size = 5
    memory_shaper = Memory_Shaper(buffer_size,
                                  buffer_size,
                                  seed,
                                  new_reward_fn=new_reward_fn,
                                  action_balanced_replay_buffer=False)
    memory_shaper.reset()
    states = [0, 1, 2]
    next_states = [1, 10, 11]
    rewards = [10, 5, -4]
    actions = [0, 5, 2]
    dones = [False, False, True]
    memory_shaper.add_episode_experience(states, next_states, rewards, actions,
                                         dones)
    action_rules = {
        6: (0, 5),
        7: (0, 5, 2),
        1: (1, ),
        2: (2, ),
        3: (3, ),
        4: (4, ),
        5: (5, ),
        0: (0, )
    }
    replay_buffer = memory_shaper.put_adapted_experiences_in_a_replay_buffer(
        action_rules)
    assert len(replay_buffer) == 5
    s_states, s_actions, s_rewards, s_next_states, s_dones = replay_buffer.sample(
        separate_out_data_types=True)
    assert all(
        s_states.numpy() == np.array([[1.0], [0.0], [0.0], [2.0], [0.0]]))
    assert all(
        s_actions.numpy() == np.array([[5.0], [0.0], [7.0], [2.0], [6.0]]))
    assert np.allclose(
        s_rewards.numpy(),
        np.array([[5.0], [10.0], [np.round(new_reward_fn(11.0, 3), 5)], [-4.0],
                  [new_reward_fn(15.0, 2)]]))
    assert all(s_next_states.numpy() == np.array([[10.0], [1.0], [11.0],
                                                  [11.0], [10.0]]))
    assert all(
        s_dones.numpy() == np.array([[0.0], [0.0], [1.0], [1.0], [0.0]]))
class DDQN_Wrapper(DDQN):
    def __init__(self,
                 config,
                 global_action_id_to_primitive_action,
                 end_of_episode_symbol="/"):
        super().__init__(config)
        self.state_size += 1  # state is augmented with a normalised step counter (appended in step())

        self.q_network_local = self.create_NN(input_dim=self.state_size,
                                              output_dim=self.action_size)
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])
        self.q_network_target = self.create_NN(input_dim=self.state_size,
                                               output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)

        self.min_episode_score_seen = float("inf")
        self.end_of_episode_symbol = end_of_episode_symbol
        self.global_action_id_to_primitive_action = global_action_id_to_primitive_action
        self.action_id_to_stepping_stone_action_id = {}
        self.calculate_q_values_as_increments = self.config.hyperparameters[
            "calculate_q_values_as_increments"]
        self.abandon_ship = self.config.hyperparameters["abandon_ship"]
        self.pre_training_learning_iterations_multiplier = self.hyperparameters[
            "pre_training_learning_iterations_multiplier"]
        self.copy_over_hidden_layers = self.hyperparameters[
            "copy_over_hidden_layers"]
        self.action_balanced_replay_buffer = self.hyperparameters[
            "action_balanced_replay_buffer"]
        self.original_primitive_actions = list(range(self.action_size))
        self.memory_shaper = Memory_Shaper(
            self.hyperparameters["buffer_size"],
            self.hyperparameters["batch_size"], config.seed,
            self.update_reward_to_encourage_longer_macro_actions,
            self.action_balanced_replay_buffer)
        self.action_length_reward_bonus = self.hyperparameters[
            "action_length_reward_bonus"]
        self.only_train_new_actions = self.hyperparameters[
            "only_train_new_actions"]
        self.only_train_final_layer = self.hyperparameters[
            "only_train_final_layer"]

    def update_agent(self, global_action_id_to_primitive_action,
                     new_actions_just_added):
        """Updates the agent according to new action set by changing its action set, creating a new replay buffer
        and doing any pretraining"""
        current_num_actions = len(global_action_id_to_primitive_action.keys())
        PRE_TRAINING_ITERATIONS = int(
            self.pre_training_learning_iterations_multiplier)
        self.update_agent_for_new_actions(
            global_action_id_to_primitive_action,
            copy_over_hidden_layers=self.copy_over_hidden_layers,
            change_or_append_final_layer="APPEND")
        if len(new_actions_just_added) > 0:
            replay_buffer = self.memory_shaper.put_adapted_experiences_in_a_replay_buffer(
                global_action_id_to_primitive_action)
            self.overwrite_replay_buffer_and_pre_train_agent(
                replay_buffer,
                PRE_TRAINING_ITERATIONS,
                only_train_final_layer=self.only_train_final_layer,
                only_train_new_actions=self.only_train_new_actions,
                new_actions_just_added=new_actions_just_added)
        print("Now there are {} actions: {}".format(
            current_num_actions, self.global_action_id_to_primitive_action))

    def overwrite_replay_buffer_and_pre_train_agent(self, replay_buffer,
                                                    training_iterations,
                                                    only_train_final_layer,
                                                    only_train_new_actions,
                                                    new_actions_just_added):
        """Overwrites the replay buffer of the agent and sets it to the provided replay_buffer. Then trains the agent
        for training_iterations number of iterations using data from the replay buffer"""
        assert replay_buffer is not None
        self.memory = replay_buffer
        if only_train_final_layer:
            print("Only training the final layer")
            self.freeze_all_but_output_layers(self.q_network_local)

        for g in self.q_network_optimizer.param_groups:
            g['lr'] = self.hyperparameters["learning_rate"] / 100.0
        for _ in range(training_iterations):
            if only_train_new_actions: new_actions = new_actions_just_added
            else: new_actions = []
            self.learn(print_loss=False, only_these_actions=new_actions)
        for g in self.q_network_optimizer.param_groups:
            g['lr'] = self.hyperparameters["learning_rate"]
        if only_train_final_layer:
            self.unfreeze_all_layers(self.q_network_local)

    def update_agent_for_new_actions(self,
                                     global_action_id_to_primitive_action,
                                     copy_over_hidden_layers,
                                     change_or_append_final_layer):
        assert change_or_append_final_layer in ["CHANGE", "APPEND"]
        num_actions_before = self.action_size
        self.global_action_id_to_primitive_action = global_action_id_to_primitive_action
        self.action_size = len(global_action_id_to_primitive_action)
        num_new_actions = self.action_size - num_actions_before
        if num_new_actions > 0:
            for new_action_id in range(num_actions_before,
                                       num_actions_before + num_new_actions):
                self.update_action_id_to_stepping_stone_action_id(
                    new_action_id)
            if change_or_append_final_layer == "CHANGE":
                self.change_final_layer_q_network(copy_over_hidden_layers)
            else:
                self.append_to_final_layers(num_new_actions)

    def update_action_id_to_stepping_stone_action_id(self, new_action_id):
        """Update action_id_to_stepping_stone_action_id with the new actions created"""
        new_action = self.global_action_id_to_primitive_action[new_action_id]
        length_macro_action = len(new_action)
        print(" update_action_id_to_stepping_stone_action_id ")
        for sub_action_length in reversed(range(1, length_macro_action)):
            sub_action = new_action[:sub_action_length]
            if sub_action in self.global_action_id_to_primitive_action.values(
            ):
                sub_action_id = list(
                    self.global_action_id_to_primitive_action.keys())[list(
                        self.global_action_id_to_primitive_action.values(
                        )).index(sub_action)]

                self.action_id_to_stepping_stone_action_id[
                    new_action_id] = sub_action_id
                print("Action {} has largest sub action {}".format(
                    new_action_id, sub_action_id))
                break
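    # Illustrative example, following the loop above: if action 7 maps to (0, 5, 2)
    # and action 6 already maps to (0, 5), then (0, 5) is the longest existing prefix
    # of (0, 5, 2), so action 6 is recorded as the stepping-stone action for action 7.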

    def append_to_final_layers(self, num_new_actions):
        """Appends to the end of a network to allow it to choose from the new actions. It does not change the weights
        for the other actions"""
        print("Appending options to final layer")
        assert num_new_actions > 0
        self.q_network_local.output_layers.append(
            nn.Linear(
                in_features=self.q_network_local.output_layers[0].in_features,
                out_features=num_new_actions))
        self.q_network_target.output_layers.append(
            nn.Linear(
                in_features=self.q_network_local.output_layers[0].in_features,
                out_features=num_new_actions))
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])

    def change_final_layer_q_network(self, copy_over_hidden_layers):
        """Completely changes the final layer of the q network to accomodate the new action space"""
        print("Completely changing final layer")
        assert len(self.q_network_local.output_layers) == 1
        if copy_over_hidden_layers:
            self.q_network_local.output_layers[0] = nn.Linear(
                in_features=self.q_network_local.output_layers[0].in_features,
                out_features=self.action_size)
            self.q_network_target.output_layers[0] = nn.Linear(
                in_features=self.q_network_target.output_layers[0].in_features,
                out_features=self.action_size)
        else:
            self.q_network_local = self.create_NN(input_dim=self.state_size,
                                                  output_dim=self.action_size)
            self.q_network_target = self.create_NN(input_dim=self.state_size,
                                                   output_dim=self.action_size)
        Base_Agent.copy_model_over(from_model=self.q_network_local,
                                   to_model=self.q_network_target)
        self.q_network_optimizer = optim.Adam(
            self.q_network_local.parameters(),
            lr=self.hyperparameters["learning_rate"])

    def run_n_episodes(self, num_episodes,
                       episodes_to_run_with_no_exploration):
        self.turn_on_any_epsilon_greedy_exploration()

        self.round_of_macro_actions = []

        self.episode_actions_scores_and_exploration_status = []
        num_episodes_to_get_to = self.episode_number + num_episodes
        while self.episode_number < num_episodes_to_get_to:
            self.reset_game()
            self.step()
            self.save_and_print_result()
            if num_episodes_to_get_to - self.episode_number == episodes_to_run_with_no_exploration:
                self.turn_off_any_epsilon_greedy_exploration()

        assert len(self.episode_actions_scores_and_exploration_status
                   ) == num_episodes, "{} vs. {}".format(
                       len(self.episode_actions_scores_and_exploration_status),
                       num_episodes)
        assert len(self.episode_actions_scores_and_exploration_status[0]) == 3
        assert self.episode_actions_scores_and_exploration_status[0][2] in [
            True, False
        ]
        assert isinstance(
            self.episode_actions_scores_and_exploration_status[0][1], list)
        assert isinstance(
            self.episode_actions_scores_and_exploration_status[0][1][0], int)
        assert isinstance(
            self.episode_actions_scores_and_exploration_status[0][0],
            int) or isinstance(
                self.episode_actions_scores_and_exploration_status[0][0],
                float)

        return self.episode_actions_scores_and_exploration_status, self.round_of_macro_actions

    def learn(self, experiences=None, print_loss=False, only_these_actions=[]):
        """Runs a learning iteration for the Q network"""
        if len(only_these_actions) == 0:
            super().learn()
            # self.learn_predict_next_state()
        else:
            experiences = self.memory.sample_experiences_with_certain_actions(
                only_these_actions, self.action_size,
                int(self.hyperparameters["batch_size"]))
            super().learn(experiences=experiences)

    def update_reward_to_encourage_longer_macro_actions(
            self, cumulative_reward, length_of_macro_action):
        """Update reward to encourage usage of longer macro actions. The size of the improvement depends positively
        on the length of the macro action"""
        if cumulative_reward == 0.0: increment = 0.1
        else: increment = abs(cumulative_reward)
        total_change = increment * ((length_of_macro_action - 1) ** 0.5) * self.action_length_reward_bonus
        cumulative_reward += total_change
        return cumulative_reward
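    # Worked example (with an illustrative action_length_reward_bonus of 0.1): a macro
    # action of length 3 with cumulative reward 17.0 gives increment = 17.0 and
    # total_change = 17.0 * sqrt(2) * 0.1 ≈ 2.40, so the shaped reward is ≈ 19.40.
    # Length-1 actions are unchanged because sqrt(0) = 0.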

    def step(self):
        """Runs a step within a game including a learning step if required"""
        self.total_episode_score_so_far = 0
        step_number = 0.0
        self.state = np.append(
            self.state, step_number /
            200.0)  #Divide by 200 because there are 200 steps in cart pole

        macro_state = self.state
        state = self.state
        done = self.done

        episode_macro_actions = []

        while not done:
            macro_action = self.pick_action(state=macro_state)
            primitive_actions = self.global_action_id_to_primitive_action[
                macro_action]
            macro_reward = 0
            primitive_actions_conducted = 0
            for action in primitive_actions:

                # "Abandon ship": optionally cut a macro action short if its next primitive
                # step looks clearly worse than the current greedy action
                if self.abandon_ship:

                    if primitive_actions_conducted >= 1:

                        if isinstance(state, np.int64) or isinstance(
                                state, int):
                            state_tensor = np.array([state])
                        else:
                            state_tensor = state
                        state_tensor = torch.from_numpy(
                            state_tensor).float().unsqueeze(0).to(self.device)

                        with torch.no_grad():
                            q_values = self.calculate_q_values(
                                self.q_network_local(
                                    state_tensor))[:, :self.get_action_size()]
                        q_value_highest = torch.max(q_values)
                        q_values_action = q_values[:, action]

                        if q_value_highest == 0.0:
                            increment = 1.0
                        else:
                            increment = abs(q_value_highest)

                        max_difference = 0.2 * increment
                        if q_values_action + max_difference < q_value_highest:
                            # print("BREAKING Action {} -- Q Values {}".format(action, q_values))
                            macro_reward -= 0.25  #punish agent for picking macro action that it had to pull out of
                            # break
                            print(
                                "Changing Course of Action {} to {} -- Q Values {}"
                                .format(action, torch.argmax(q_values),
                                        q_values))
                            action = torch.argmax(q_values).item()

                step_number += 1

                next_state, reward, done, _ = self.environment.step(action)
                self.total_episode_score_so_far += reward
                if self.hyperparameters["clip_rewards"]:
                    reward = max(min(reward, 1.0), -1.0)
                macro_reward += reward
                primitive_actions_conducted += 1
                next_state = np.append(
                    next_state, step_number / 200.0
                )  #Divide by 200 because there are 200 steps in cart pole
                self.track_episodes_data(state, action, reward, next_state,
                                         done)

                self.save_experience(experience=(state, action, reward,
                                                 next_state, done))

                state = next_state
                if self.time_for_q_network_to_learn():
                    for _ in range(
                            self.hyperparameters["learning_iterations"]):
                        self.learn()
                if done: break

            macro_reward = self.update_reward_to_encourage_longer_macro_actions(
                macro_reward, primitive_actions_conducted)
            macro_next_state = next_state
            macro_done = done
            if macro_action != action:  # only store a separate macro-level transition when the macro differed from its last primitive step
                self.save_experience(experience=(macro_state, macro_action,
                                                 macro_reward,
                                                 macro_next_state, macro_done))
            macro_state = macro_next_state

            episode_macro_actions.append(macro_action)
            self.round_of_macro_actions.append(macro_action)
        if random.random() < 0.1: print(Counter(episode_macro_actions))

        self.store_episode_in_memory_shaper()
        self.save_episode_actions_with_score()
        self.episode_number += 1

    def track_episodes_data(self, state, action, reward, next_state, done):
        self.episode_states.append(state)
        self.episode_rewards.append(reward)
        self.episode_actions.append(action)
        self.episode_next_states.append(next_state)
        self.episode_dones.append(done)

    def save_episode_actions_with_score(self):
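        # Appends [total episode score, the episode's action ids followed by the
        # end_of_episode_symbol, whether exploration was turned off]; this is the
        # structure asserted at the end of run_n_episodes.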

        self.episode_actions_scores_and_exploration_status.append([
            self.total_episode_score_so_far,
            self.episode_actions + [self.end_of_episode_symbol],
            self.turn_off_exploration
        ])

    def store_episode_in_memory_shaper(self):
        """Stores the raw state, next state, reward, done and action information for the latest full episode"""
        self.memory_shaper.add_episode_experience(self.episode_states,
                                                  self.episode_next_states,
                                                  self.episode_rewards,
                                                  self.episode_actions,
                                                  self.episode_dones)

    def calculate_q_values(self, network_action_values):

        if not self.calculate_q_values_as_increments:
            return network_action_values

        for action_id in range(self.action_size):
            if action_id in self.action_id_to_stepping_stone_action_id.keys():
                stepping_stone_id = self.action_id_to_stepping_stone_action_id[
                    action_id]
                # should do this with no grad? Or grad?
                with torch.no_grad():
                    network_action_values[:, action_id] += network_action_values[:, stepping_stone_id]  # .detach()
        # assert network_action_values.shape[0] in set([self.hyperparameters["batch_size"], 1])
        assert network_action_values.shape[1] == self.action_size
        return network_action_values
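    # Illustrative example (made-up values): if macro action 3 has stepping-stone
    # action 0 and the network outputs [2.0, 1.0, 0.5, 0.4], the adjusted value for
    # action 3 becomes 0.4 + 2.0 = 2.4, i.e. the macro action's Q value is learned as
    # an increment on top of its stepping-stone sub-action's value.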

    def pick_action(self, state=None):
        """Uses the local Q network and an epsilon greedy policy to pick an action"""
        # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add
        # a "fake" dimension to make it a mini-batch rather than a single observation
        if state is None: state = self.state
        if isinstance(state, np.int64) or isinstance(state, int):
            state = np.array([state])
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        if len(state.shape) < 2: state = state.unsqueeze(0)
        self.q_network_local.eval()  #puts network in evaluation mode
        with torch.no_grad():
            action_values = self.calculate_q_values(
                self.q_network_local(state))
        self.q_network_local.train()  #puts network back in training mode
        action = self.exploration_strategy.perturb_action_for_exploration_purposes(
            {
                "action_values": action_values,
                "turn_off_exploration": self.turn_off_exploration,
                "episode_number": self.episode_number
            })
        self.logger.info("Q values {} -- Action chosen {}".format(
            action_values, action))
        return action

    def compute_q_values_for_next_states(self, next_states):
        """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN
        uses the local index to pick the maximum q_value action and then the target network to calculate the q_value.
        The reasoning behind this is that it will help stop the network from overestimating q values"""
        max_action_indexes = self.calculate_q_values(
            self.q_network_local(next_states)).detach().argmax(1)
        Q_targets_next = self.calculate_q_values(
            self.q_network_target(next_states)).gather(
                1, max_action_indexes.unsqueeze(1))
        return Q_targets_next

    def compute_expected_q_values(self, states, actions):
        """Computes the expected q_values we will use to create the loss to train the Q network"""
        # must convert actions to long so can be used as index
        Q_expected = self.calculate_q_values(
            self.q_network_local(states)).gather(1, actions.long())
        return Q_expected