Example #1
0
def test_normal_buffer_her():
    """HER relabelling in the flat (non-recurrent) replay buffer.

    Pushes 6 transitions with an episode ending every 3rd step. Each
    finished episode is expected to be stored twice: once relabelled with
    the hindsight mission [11, 11] and once with the original mission
    [42, 42].
    """
    config = json.load(open("config/model/conv_fetch_minigrid.json", 'r'))
    config = config["algo_params"]["experience_replay_config"]
    env = DummyEnv(4)

    config["use_her"] = True
    config["use_compression"] = False
    # Fixed typo: was "prioritze", so the intended False was never applied
    # (the other tests in this file spell the key correctly).
    config["prioritize"] = False
    config["size"] = 10
    config["n_step"] = 1
    config["gamma"] = 0.99
    config["num_workers"] = 1

    buffer = ReplayBufferParallel(config, is_recurrent=False, env=env)

    mission = [42, 42]
    action = 0
    reward = 0
    for elem in range(1, 7):
        done = False
        hindsight_mission = None
        reward = 0
        if elem % 3 == 0:
            # Episode boundary: terminate and provide a hindsight goal.
            done = True
            hindsight_mission = [11, 11]
            reward = 0

        buffer.add_transition(elem, action, reward, elem + 1, done, mission, 2,
                              hindsight_mission)
        if elem == 4:
            # After the first episode (steps 1-3) both copies are stored:
            # the hindsight copy starting at index 0, the original-mission
            # copy starting at index 3.
            assert buffer.dataset.memory[0].current_state == 1
            assert buffer.dataset.memory[0].next_state == 2
            assert buffer.dataset.memory[0].mission == [11, 11]

            assert buffer.dataset.memory[3].current_state == 1
            assert buffer.dataset.memory[3].next_state == 2
            assert buffer.dataset.memory[3].mission == [42, 42]

            assert len(buffer) == 6
            assert buffer.dataset.position == 6

    # With size=10 the second episode's copies wrap around the ring buffer
    # (position back at 3), overwriting the oldest entries — presumably why
    # index 0 now holds a step-4 original-mission transition.
    assert buffer.dataset.memory[0].current_state == 4
    assert buffer.dataset.memory[0].next_state == 5
    assert buffer.dataset.memory[0].mission == [42, 42]

    assert len(buffer) == 9
    assert buffer.dataset.position == 3
Example #2
0
def test_normal_buffer_no_her():
    """Plain FIFO buffer (HER and prioritization disabled).

    Checks stored contents after wrap-around, uniform sampling weights,
    and that update_transitions_proba leaves priorities flat when
    prioritization is off.
    """
    with open("config/model/conv_fetch_minigrid.json", 'r') as config_file:
        config = json.load(config_file)
    config = config["algo_params"]["experience_replay_config"]

    config.update({
        "use_her": False,
        "use_compression": False,
        "prioritize": False,
        "size": 10,
        "n_step": 1,
        "gamma": 0.99,
        "num_workers": 1,
        "batch_size": 2,
    })

    env = DummyEnv(4)
    buffer = ReplayBufferParallel(config, is_recurrent=False, env=env)

    mission = [42, 42]
    for step in range(1, 15):
        episode_over = step % 3 == 0
        buffer.add_transition(step, 0, 0, step + 1, episode_over, mission, 2,
                              [11, 11] if episode_over else None)

    # 14 pushes into a size-10 ring: oldest entries overwritten.
    assert buffer.dataset.memory[0].current_state == 10
    assert buffer.dataset.memory[0].next_state == 11
    assert buffer.dataset.memory[0].mission == [42, 42]

    assert len(buffer) == 9
    assert buffer.dataset.position == 3
    assert buffer.batch_sampler.n_memory_cell == 9

    sample, weight = buffer.sample()
    assert weight == 1
    assert buffer.batch_sampler.prioritize_p.sum() == 9
    assert buffer.batch_sampler.prioritize_proba.sum() == 1

    assert np.all(buffer.batch_sampler.prioritize_p == [1] * 9 + [0])
    assert np.all(buffer.batch_sampler.prioritize_proba == [1 / 9] * 9)

    # Without prioritization this must be a no-op on priorities.
    buffer.update_transitions_proba(np.array([10, 10]))

    sample, weight = buffer.sample()
    assert weight == 1
    assert np.all(buffer.batch_sampler.prioritize_p == [1] * 9 + [0])
    assert np.all(buffer.batch_sampler.prioritize_proba == [1 / 9] * 9)
Example #3
0
def test_recurrent_buffer_her_nstep():
    """Recurrent replay buffer with HER, prioritization and 3-step returns.

    Pushes two 4-step episodes (done on every 4th step). Each finished
    episode is expected to be stored as one sequence cell twice: first
    relabelled with the hindsight mission [11, 11] (terminal reward
    propagated through the discounted n-step return), then with the
    original mission [42, 42] (reward stays 0).
    NOTE(review): code left byte-identical; asserts are coupled to the
    sampler's RNG state via last_id_sampled.
    """
    config = json.load(open("config/model/conv_vizdoom_recurrent.json", 'r'))
    config = config["algo_params"]["experience_replay_config"]

    config["use_her"] = True
    config["use_compression"] = False
    config["prioritize"] = True
    config["size"] = 10
    config["batch_size"] = 2
    config["n_step"] = 3
    config["gamma"] = 0.99
    config["num_workers"] = 1
    env = DummyEnv(4)

    # recurrent_memory_saving=2: presumably stores one sequence cell every
    # 2 steps — TODO confirm against ReplayBufferParallel.
    buffer = ReplayBufferParallel(config,
                                  is_recurrent=True,
                                  env=env,
                                  recurrent_memory_saving=2)
    action = 0
    mission = [42, 42]

    for elem in range(1, 9):
        done = False
        hindsight_mission = None
        reward = 0
        if elem % 4 == 0:
            done = True
            hindsight_mission = [11, 11]
            reward = 0

        # Recurrent buffer takes array-shaped states/hidden state.
        buffer.add_transition(np.array([[elem]]), action, reward,
                              np.array([[elem + 1]]), done, mission,
                              np.array([2]), hindsight_mission)
        # print("len : {}, Position : {}, current_episode_size_in_buffer {}, current_step {}".format(
        #     len(buffer), buffer.position, len(buffer.current_episode), elem))
        # print("buffer memory", buffer.memory)

        if elem == 4:

            print(buffer.dataset.memory)
            # Cell 0: hindsight copy of episode 1; cell 1: original copy.
            assert buffer.dataset.n_memory_cell == 2
            # 3-step transition from state 1: next_state is 3 steps ahead,
            # reward/gamma discounted accordingly.
            assert buffer.dataset.memory[0][0].current_state == 1
            assert buffer.dataset.memory[0][0].next_state == 4
            assert buffer.dataset.memory[0][0].terminal == False
            assert buffer.dataset.memory[0][0].reward == 0
            assert buffer.dataset.memory[0][0].mission == [11, 11]
            assert buffer.dataset.memory[0][0].gamma == 0.99**3

            # Near the episode end the n-step window is truncated at the
            # terminal state (next_state capped at 5).
            assert buffer.dataset.memory[0][1].current_state == 2
            assert buffer.dataset.memory[0][1].next_state == 5
            assert buffer.dataset.memory[0][1].terminal == True
            assert buffer.dataset.memory[0][1].reward == 0.99**2
            assert buffer.dataset.memory[0][1].mission == [11, 11]
            assert buffer.dataset.memory[0][1].gamma == 0.99**3

            assert buffer.dataset.memory[0][2].current_state == 3
            assert buffer.dataset.memory[0][2].next_state == 5
            assert buffer.dataset.memory[0][2].terminal == True
            assert buffer.dataset.memory[0][2].reward == 0.99
            assert buffer.dataset.memory[0][2].mission == [11, 11]
            assert buffer.dataset.memory[0][2].gamma == 0.99**2

            assert buffer.dataset.memory[0][3].current_state == 4
            assert buffer.dataset.memory[0][3].next_state == 5
            assert buffer.dataset.memory[0][3].terminal == True
            assert buffer.dataset.memory[0][3].reward == 1
            assert buffer.dataset.memory[0][3].mission == [11, 11]
            assert buffer.dataset.memory[0][3].gamma == 0.99

            # Original-mission copy: same transitions, reward stays 0.
            assert buffer.dataset.memory[1][0].current_state == 1
            assert buffer.dataset.memory[1][0].next_state == 4
            assert buffer.dataset.memory[1][0].terminal == False
            assert buffer.dataset.memory[1][0].mission == [42, 42]
            assert buffer.dataset.memory[1][0].reward == 0
            assert buffer.dataset.memory[1][0].gamma == 0.99**3

            assert buffer.dataset.memory[1][2].current_state == 3
            assert buffer.dataset.memory[1][2].next_state == 5
            assert buffer.dataset.memory[1][2].terminal == True
            assert buffer.dataset.memory[1][2].reward == 0
            assert buffer.dataset.memory[1][2].mission == [42, 42]
            assert buffer.dataset.memory[1][2].gamma == 0.99**2

    s, w = buffer.sample()
    id_sampled = buffer.batch_sampler.last_id_sampled
    assert np.all(w == [1, 1])
    assert buffer.batch_sampler.prioritize_proba.sum() == 1
    assert np.all(buffer.batch_sampler.prioritize_proba == [1 / 3] * 3)
    assert len(buffer) == 3
    print(buffer.dataset.memory)

    # Per-transition TD errors folded into per-cell priorities.
    buffer.update_transitions_proba([5, 7.5, 10, 5, 7.5, 10])
    p = np.ones(5)
    p[4] = 0
    p[3] = 0

    # Expected priority: (0.9 * max + 0.1 * mean)-style mix raised to the
    # alpha exponent 0.9 — presumably mirrors the sampler's formula; TODO
    # confirm against the batch sampler implementation.
    p[id_sampled] = (10 * 0.9 + 7.5 * 0.1)**0.9

    assert np.isclose(p, buffer.batch_sampler.prioritize_p, atol=0.1).all()
Example #4
0
def test_normal_buffer_her_prio_nstep():
    """HER + prioritization + 3-step returns in the flat replay buffer.

    Pushes two 4-step episodes (twice), checking stored n-step transitions,
    uniform initial priorities, and that update_transitions_proba reweights
    the last-sampled transitions by priority**0.9.
    """
    config = json.load(open("config/model/conv_fetch_minigrid.json", 'r'))
    config = config["algo_params"]["experience_replay_config"]
    env = DummyEnv(4)

    config["use_her"] = True
    config["use_compression"] = False
    # Fixed typo: was "prioritze". The test name and the priority-update
    # assertions below require prioritization to be enabled.
    config["prioritize"] = True

    config["size"] = 10
    config["n_step"] = 3
    config["gamma"] = 0.99
    config["num_workers"] = 1
    config["batch_size"] = 2

    buffer = ReplayBufferParallel(config, is_recurrent=False, env=env)

    mission = [42, 42]
    action = 0

    def fill_two_episodes():
        # Push 8 transitions (two 4-step episodes with hindsight goals);
        # mid-way, check the n-step transition stored at index 5 and the
        # buffer fill level. Identical fill is performed twice by the test.
        for elem in range(1, 9):
            done = elem % 4 == 0
            hindsight_mission = [11, 11] if done else None
            buffer.add_transition(elem, action, 0, elem + 1, done, mission,
                                  2, hindsight_mission)
            if elem == 4:
                # Original-mission copy of the first episode: 3-step
                # transition from state 2, reward 0, gamma 0.99**3.
                assert buffer.dataset.memory[5].current_state == 2
                assert buffer.dataset.memory[5].next_state == 5
                assert buffer.dataset.memory[5].mission == [42, 42]
                assert buffer.dataset.memory[5].reward == 0
                assert buffer.dataset.memory[5].gamma == 0.99**3

                assert len(buffer) == 8
                assert buffer.dataset.position == 8

    fill_two_episodes()

    # Hindsight copy of the second episode: terminal reward discounted
    # through the truncated n-step window.
    assert buffer.dataset.memory[1].current_state == 6
    assert buffer.dataset.memory[1].next_state == 9
    assert buffer.dataset.memory[1].mission == [11, 11]
    assert buffer.dataset.memory[1].reward == 0.99**2
    assert buffer.dataset.memory[1].terminal == True
    assert buffer.dataset.memory[1].gamma == 0.99**3

    assert len(buffer) == 8
    assert buffer.dataset.position == 8

    # Initial priorities are uniform over the 8 filled slots (2 empty).
    s, w = buffer.sample()
    id_updated = buffer.batch_sampler.last_id_sampled

    assert np.all(w == [1, 1])
    assert buffer.batch_sampler.prioritize_p.sum() == 8
    assert buffer.batch_sampler.prioritize_proba.sum() == 1

    assert np.all(buffer.batch_sampler.prioritize_p == [1] * 8 + [0, 0])
    assert np.all(buffer.batch_sampler.prioritize_proba == [1 / 8] * 8)

    # Updating the last-sampled ids sets their priority to 10**0.9
    # (alpha exponent 0.9 — presumably the sampler's setting).
    buffer.update_transitions_proba(np.array([10, 10]))
    s, w = buffer.sample()

    id_updated2 = buffer.batch_sampler.last_id_sampled

    p = np.ones(10)
    p[id_updated] = 10**0.9
    p[-2:] = 0
    proba = p[:8] / p.sum()

    assert np.isclose(buffer.batch_sampler.prioritize_p, p).all()
    assert np.isclose(buffer.batch_sampler.prioritize_proba, proba).all()

    buffer.update_transitions_proba(np.array([3, 3]))
    buffer.sample()

    p = np.ones(10)
    p[id_updated] = 10**0.9
    p[id_updated2] = 3**0.9
    p[-2:] = 0
    proba = p[:8] / p.sum()

    assert np.isclose(buffer.batch_sampler.prioritize_p, p).all()
    assert np.isclose(buffer.batch_sampler.prioritize_proba, proba).all()

    # Refilling with the same episodes overwrites the slots and resets the
    # priorities back to uniform.
    fill_two_episodes()

    s, w = buffer.sample()
    assert np.all(w == [1, 1])
    assert buffer.batch_sampler.prioritize_proba.sum() == 1
    assert np.all(buffer.batch_sampler.prioritize_proba == [1 / 8] * 8)
    def __init__(self,
                 env,
                 config,
                 test_env=None,
                 logger=None,
                 visualizer=None,
                 device='cpu'):
        """Build the double-DQN agent: policy/target nets, optimizer,
        replay buffer and exploration schedule.

        Args:
            env: training environment, provides observation/action spaces.
            config: algorithm config dict (architecture, optimizer,
                experience_replay_config, exploration settings, ...).
            test_env: optional evaluation environment.
            logger: optional logger, also stored as ``self.writer``.
            visualizer: optional Q-values visualizer.
            device: torch device string ('cpu' or 'cuda').

        Raises:
            NotImplementedError: if config["optimizer"] is neither
                'rmsprop' nor 'adam'.
        """
        super(BaseDoubleDQN, self).__init__()

        # Pick the network constructor from the architecture name;
        # anything other than the two conv variants falls back to an MLP.
        if config["architecture"] == "conv":
            nn_creator = MinigridConvPolicy
        elif config["architecture"] == "conv_lstm":
            nn_creator = MinigridRecurrentPolicy
        else:
            nn_creator = MlpNet

        self.env = env
        self.test_env = test_env
        self.tf_logger = logger
        self.q_values_visualizer = visualizer

        self.policy_net = nn_creator(obs_space=env.observation_space,
                                     action_space=env.action_space,
                                     config=config["architecture_params"],
                                     device=device)
        # Target net starts as an exact copy of the policy net and stays in
        # eval mode — it is only read for bootstrapped targets.
        self.target_net = nn_creator(obs_space=env.observation_space,
                                     action_space=env.action_space,
                                     config=config["architecture_params"],
                                     device=device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # Idiomatic default lookup (was: `config["optimize"] if "optimize"
        # in config else True`).
        self.optimize = config.get("optimize", True)
        optimizer_name = config["optimizer"].lower()
        if optimizer_name == 'rmsprop':
            self.optimizer = torch.optim.RMSprop(
                self.policy_net.parameters(),
                lr=config["lr"],
                weight_decay=config["weight_decay"])
        elif optimizer_name == 'adam':
            self.optimizer = torch.optim.Adam(
                self.policy_net.parameters(),
                lr=config["lr"],
                weight_decay=config["weight_decay"])
        else:
            raise NotImplementedError(
                "Optimizer '{}' doesn't exist".format(optimizer_name))

        self.wait_steps_before_optim = config["wait_steps_before_optim"]
        self.n_optimize_per_step = config["n_optimize_per_step"]
        self.n_update_policy_net = 0

        # Alias the nested replay config once instead of repeating the
        # double lookup; writes below intentionally mutate `config`.
        er_config = config["experience_replay_config"]
        self.batch_size = er_config["batch_size"]
        self.n_actions = env.action_space.n

        # Propagate gamma / n-step into the replay buffer so n-step returns
        # are computed consistently with the learner.
        er_config["n_step"] = config["n_step"]
        er_config["gamma"] = config["gamma"]

        is_recurrent = config["architecture"] == "conv_lstm"

        # THER uses a learnt instruction generator for hindsight
        # relabelling; otherwise use the plain parallel replay buffer.
        if er_config["use_ther"]:
            replay_buffer = LearntHindsightExperienceReplay(
                config=er_config,
                is_recurrent=is_recurrent,
                env=env,
                device=device,
                logger=logger)
        else:
            replay_buffer = ReplayBufferParallel(
                config=er_config,
                is_recurrent=is_recurrent,
                env=env)

        self.replay_buffer = replay_buffer

        # Epsilon-greedy exploration: decays from 1 to 0.04 over
        # `step_exploration` steps (schedule applied elsewhere).
        self.epsilon_init = 1
        self.epsilon_min = 0.04
        self.step_exploration = config["step_exploration"]
        self.current_epsilon = self.epsilon_init
        self.total_steps = 0

        self.grad_norm_limit = config["grad_norm_limit"]
        self.update_target_every = config["update_target_every"]
        self.n_update_target = 0

        self.device = device
        self.to(self.device)

        self.writer = logger
        self.PADDING_MISSION = 2  # Padding is always 2, checked by vocab