class ALGEnv(gym.Env):
    metadata = {
        'render.modes':
        ['human', 'rgb_array', 'tiny_human', 'tiny_rgb_array', 'np_array']
    }

    def __init__(self,
                 dim_room=(10, 10),
                 num_boxes=4,
                 reset=True,
                 log_interval=1000,
                 alg_version=0,
                 train_mode='cnn',
                 agent_lb_path=None,
                 agent_ub_path=None,
                 init_probs=[0.5, 0.5, 0.5]):

        assert train_mode in TRAIN_MODES
        self.train_mode = train_mode
        self.log_train_info = log_interval > 0

        # 0: basic playable map
        # 1: playable map (verified by the upper-bound agent)
        # 2: hardness-adjustable map (verified by both agents)
        self.alg_version = alg_version
        if alg_version == 0:
            pass
        else:
            env_li = [
                lambda: SokobanEnv(dim_room=dim_room,
                                   max_steps=50,
                                   num_boxes=num_boxes,
                                   train_mode=train_mode,
                                   log_train_info=False)
            ]
            self.soko_env = DummyVecEnv(env_li)
            self.agent_ub = PPO.load(agent_ub_path, env=self.soko_env)
            print('loaded', agent_ub_path, 'as ub')
            if alg_version == 2:
                self.agent_lb = PPO.load(agent_lb_path, env=self.soko_env)
                print('loaded', agent_lb_path, 'as lb')

        # General Configuration
        self.dim_room = dim_room
        self.num_boxes = num_boxes
        self.num_players = 1

        # Training hyperparameters
        self.max_prefer_subs = dim_room[0] * dim_room[1] // 2
        self.place_target_prob = init_probs[0]
        self.place_box_prob = init_probs[1]
        self.place_player_prob = init_probs[2]

        # Log info
        self.start_time = time.time()
        self.train_result_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
        self.fail_type_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
        # self.sample_map = False
        self.episode_reward = 0
        self.total_reward_per_log_interval = 0
        self.total_steps_per_log_interval = 0
        self.total_subs_per_log_interval = 0
        self.log_interval = log_interval
        self.reseted = False
        self.train_counter = 0

        # Env properties
        self.map = None

        # Penalties and Rewards
        self.penalty_sub_wrong_tile = -5
        self.penalty_exc_btp_tiles = -10
        self.penalty_bad_map_design = -50
        self.penalty_generation_fail = -50
        self.penalty_exc_subs = -10

        self.reward_neighbor_valid_tiles = 2
        self.reward_place_btp_tiles = 5
        self.reward_basic_playable = 40

        if alg_version == 1:
            # too hard or unsolvable
            self.penalty_agent_ub_thou = -30
            self.reward_agent_ub_solvable = 50
        elif alg_version == 2:
            self.penalty_agent_lb_solvable = -30
            self.penalty_agent_ub_thou = -30
            self.reward_agent_ub_solvable = 10
            self.reward_agent_lb_thou = 50

        # Generation Track
        self.placed_player = 0
        self.placed_boxes = 0
        self.placed_target = 0
        self.env_steps = 0

        # Env Settings
        self.viewer = None
        self.max_steps = dim_room[0] * dim_room[1]
        self.action_space = MultiDiscrete([dim_room[0], dim_room[1], 5])

        if train_mode == 'cnn':
            self.scale = 6
            screen_height, screen_width = (dim_room[0] * self.scale,
                                           dim_room[1] * self.scale)
            self.observation_space = Box(low=0,
                                         high=255,
                                         shape=(screen_height, screen_width,
                                                3),
                                         dtype=np.uint8)
        else:
            self.observation_space = Box(low=0,
                                         high=6,
                                         shape=(dim_room[0], dim_room[1]),
                                         dtype=np.uint8)

        if reset:
            # Initialize Room
            _ = self.reset()

    def random_init_map(self):
        room = np.zeros((self.dim_room[0], self.dim_room[1]), dtype=np.uint8)
        for _ in range(self.num_boxes):
            if np.random.rand() < self.place_target_prob:
                x = np.random.randint(1, self.dim_room[0] - 1)
                y = np.random.randint(1, self.dim_room[1] - 1)
                room[x, y] = 2
            if np.random.rand() < self.place_box_prob:
                x = np.random.randint(1, self.dim_room[0] - 1)
                y = np.random.randint(1, self.dim_room[1] - 1)
                room[x, y] = 4

        for _ in range(self.num_players):
            if np.random.rand() < self.place_player_prob:
                x = np.random.randint(1, self.dim_room[0] - 1)
                y = np.random.randint(1, self.dim_room[1] - 1)
                room[x, y] = 5

        self.placed_target += np.count_nonzero(room == 2)
        self.placed_boxes += np.count_nonzero(room == 4)
        self.placed_player += np.count_nonzero(room == 5)

        return room

    def reset(self):
        self.placed_player = 0
        self.placed_boxes = 0
        self.placed_target = 0
        self.map = self.random_init_map()
        self.env_steps = 0
        self.episode_subs = 0
        self.episode_reward = 0
        self.reseted = True

        if self.train_mode == 'cnn':
            starting_observation = self.render('tiny_rgb_array',
                                               scale=self.scale)
        else:
            starting_observation = self.render('np_array')
        return starting_observation

    def soko_agent_test(self):
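        # Grade the generated map with pretrained Sokoban agents: in v1 the map
        # counts as good if the upper-bound agent solves it; in v2 it must also
        # defeat the lower-bound agent, i.e. be hard enough without being
        # too hard or unsolvable ("thou").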
        reward = 0

        # v1
        if self.alg_version == 1:
            done = False
            obs = self.soko_env.env_method('manual_reset', self.map)
            while not done:
                action, _ = self.agent_ub.predict(obs, deterministic=True)
                obs, _, done, info = self.soko_env.step(action)

            # agent_ub solvable
            if info[0]["all_boxes_on_target"]:
                reward += self.reward_agent_ub_solvable
                train_result = 0  # good map
            else:
                reward += self.penalty_agent_ub_thou
                train_result = 2  # thou map

        # v2
        else:
            done = False
            obs = self.soko_env.env_method('manual_reset', self.map)
            while not done:
                action, _ = self.agent_ub.predict(obs, deterministic=True)
                obs, _, done, info = self.soko_env.step(action)

            # agent_ub thou
            if not info[0]["all_boxes_on_target"]:
                reward += self.penalty_agent_ub_thou
                train_result = 2  # thou

            # agent_ub solvable
            else:
                reward += self.reward_agent_ub_solvable
                done = False
                obs = self.soko_env.env_method('manual_reset', self.map)
                while not done:
                    action, _ = self.agent_lb.predict(obs, deterministic=True)
                    obs, _, done, info = self.soko_env.step(action)

                # agent_lb solvable
                if info[0]["all_boxes_on_target"]:
                    reward += self.penalty_agent_lb_solvable
                    train_result = 1  # too easy
                else:
                    reward += self.reward_agent_lb_thou
                    train_result = 0  # good map

        return reward, train_result

    def step(self, action):
        '''
        Tile type:
            0: Wall
            1: Floor
            2: Target
            3: Box On Target
            4: Box
            5: Player
            6: Player On Target
        act:
            0: Finish Generation
            1: Floor
            2: Box Target
            3: Box
            4: Player
        '''
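        # Example (hypothetical values): action = (3, 4, 2) asks to turn the
        # wall tile at (3, 4) into a box target, which only succeeds if that
        # tile is still a wall and fewer than num_boxes targets exist so far.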
        x, y, act = action
        reward = 0
        done = False
        self.env_steps += 1
        # not finish generation
        if act != 0:
            if self.map[x][y] != 0:
                reward += self.penalty_sub_wrong_tile

            # is wall tile, can substitute
            else:
                for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1),
                                 (x + 1, y)]:
                    if _x in range(self.dim_room[0]) and _y in range(
                            self.dim_room[1]):
                        if self.map[_x, _y] != 0:
                            reward += self.reward_neighbor_valid_tiles

                if act == 1:
                    self.map[x][y] = 1
                    self.episode_subs += 1
                    if self.episode_subs >= self.max_prefer_subs:
                        reward += self.penalty_exc_subs
                        # print(self.episode_subs)

                # place box target
                elif act == 2:
                    if self.placed_target >= self.num_boxes:
                        reward += self.penalty_exc_btp_tiles
                    else:
                        self.placed_target += 1
                        self.map[x][y] = 2
                        self.episode_subs += 1
                        reward += self.reward_place_btp_tiles
                        if self.episode_subs >= self.max_prefer_subs:
                            reward += self.penalty_exc_subs
                            # print(self.episode_subs)

                # place box
                elif act == 3:
                    if self.placed_boxes >= self.num_boxes:
                        reward += self.penalty_exc_btp_tiles
                    else:
                        self.placed_boxes += 1
                        self.map[x][y] = 4
                        self.episode_subs += 1
                        reward += self.reward_place_btp_tiles
                        if self.episode_subs >= self.max_prefer_subs:
                            reward += self.penalty_exc_subs
                            # print(self.episode_subs)

                # place player
                elif act == 4:
                    if self.placed_player >= self.num_players:
                        reward += self.penalty_exc_btp_tiles
                    else:
                        self.placed_player += 1
                        self.map[x][y] = 5
                        self.episode_subs += 1
                        reward += self.reward_place_btp_tiles
                        if self.episode_subs >= self.max_prefer_subs:
                            reward += self.penalty_exc_subs
                            # print(self.episode_subs)

            if self.is_maxsteps():
                done = True

        # finished generation
        else:
            done = True

        if done:
            _train_result = -1  # not used for training
            _fail_type = -1  # not failed
            if (self.placed_player != self.num_players
                    or self.placed_boxes != self.num_boxes
                    or self.placed_target != self.num_boxes):
                reward += self.penalty_generation_fail
                _fail_type = 0  # wrong number btp tiles
            else:
                if not self.basic_playable(self.map):
                    reward += self.penalty_bad_map_design
                    _fail_type = 1  # not basic playable
                else:
                    reward += self.reward_basic_playable
                    if self.alg_version == 0:
                        _train_result = 0
                    else:
                        _train_reward, _train_result = self.soko_agent_test()
                        reward += _train_reward

        self.episode_reward += reward

        # Convert the observation to RGB frame
        if self.train_mode == 'cnn':
            observation = self.render(mode='tiny_rgb_array', scale=self.scale)
        else:
            observation = self.render(mode='np_array')

        info = {
            "coordinate": (x, y),
            "action": act,
            "curr_steps": self.env_steps,
        }

        if self.reseted:
            self.reseted = False
            self.train_counter += 1

        if done:
            info["total_steps"] = self.env_steps
            info["train_result"] = _train_result
            info['fail_type'] = _fail_type

            self.train_result_summary[_train_result] += 1
            self.fail_type_summary[_fail_type] += 1
            self.total_reward_per_log_interval += self.episode_reward
            self.total_steps_per_log_interval += self.env_steps
            self.total_subs_per_log_interval += self.episode_subs

            # if _fail_type == -1 and self.sample_map:
            #     print('Sample map:')
            #     print(self.map)
            #     print('*********************************************')
            # self.sample_map = False

            if self.log_train_info and self.train_counter % self.log_interval == 0:
                end_time = time.time()
                duration = end_time - self.start_time
                avg_reward = self.total_reward_per_log_interval / self.log_interval
                avg_steps = self.total_steps_per_log_interval / self.log_interval
                avg_subs = self.total_subs_per_log_interval / self.log_interval
                print('[{}] Summary'.format(self.train_counter))
                print('Duration: %.2fs' % (duration))
                print('Average reward current log interval: ', avg_reward)
                print('Average steps current log interval: ', avg_steps)
                print('Average subs current log interval: ', avg_subs)

                print('Good Map                  :',
                      self.train_result_summary[0])
                if self.alg_version == 2:
                    print('Too easy map              :',
                          self.train_result_summary[1])
                if self.alg_version != 0:
                    print('Too hard or unsolvable map:',
                          self.train_result_summary[2])
                print('Not for training map      :',
                      self.train_result_summary[-1])

                print('Generated wrong number of btp tiles:',
                      self.fail_type_summary[0])
                print('Generated not basic playable map   :',
                      self.fail_type_summary[1])
                print('Unable to finish by max step       :',
                      self.fail_type_summary[2])
                print('Succeeded generate map for training:',
                      self.fail_type_summary[-1])
                print('*********************************************')

                self.total_reward_per_log_interval = 0
                self.total_steps_per_log_interval = 0
                self.total_subs_per_log_interval = 0
                self.train_result_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
                self.fail_type_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
                self.sample_map = True
                self.start_time = time.time()

        return observation, reward, done, info

    def render(self, mode=None, close=None, scale=16):
        if mode is None:
            if self.train_mode == 'cnn':
                mode = 'human'
            else:
                mode = 'np_array'
        assert mode in RENDERING_MODES

        if 'rgb_array' in mode:
            img = self.get_image(mode, scale)
            return img

        elif 'np_array' in mode:
            return self.map

        elif 'human' in mode:
            from gym.envs.classic_control import rendering
            if self.viewer is None or not self.viewer.isopen:
                self.viewer = rendering.SimpleImageViewer()
            img = self.get_image(mode, scale)
            self.viewer.imshow(img)
            return self.viewer.isopen

        else:
            super(ALGEnv, self).render(mode=mode)  # just raise an exception

    def get_image(self, mode, scale=1):
        if mode.startswith('tiny_'):
            img = room_to_tiny_world_rgb(self.map, scale=scale)
        else:
            img = room_to_rgb(self.map)
        return img

    def basic_playable(self, room):
        # # player can reach all boxes and all targets
        # for player_coord in np.argwhere(room==5):
        #     des = np.concatenate((np.argwhere(room==2), np.argwhere(room==4)), axis=0)
        #     if not self.contaminate(room, player_coord, des):
        #         return False

        # player can reach all non-wall tiles

        if not self.contaminate_room(room):
            return False

        # no box stuck in a corner or with three walls around it
        if self.box_stuck(room):
            return False
        return True

    def box_stuck(self, room):
        room = deepcopy(room)
        room = np.pad(room, 1, 'constant', constant_values=0)
        for (x, y) in np.argwhere(room == 4):
            # box pushed into a corner (walls on two perpendicular sides)
            if (room[x - 1, y] == room[x, y - 1] == 0
                    or room[x - 1, y] == room[x, y + 1] == 0
                    or room[x + 1, y] == room[x, y - 1] == 0
                    or room[x + 1, y] == room[x, y + 1] == 0):
                return True
            num_wall = 0
            for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1), (x + 1, y)]:
                if room[_x, _y] == 0:
                    num_wall += 1
            if num_wall >= 3:
                return True
        return False

    # player can reach any non-wall tile within the room
    def contaminate_room(self, room):
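        # Flood fill from the player tile over the wall-padded room: all
        # non-wall tiles are first collapsed to 1, then every tile reachable
        # from the player is marked 5; any remaining 1 means some non-wall
        # tile is unreachable, so the room is rejected.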
        room = deepcopy(room)
        room = np.pad(room, 1, 'constant', constant_values=0)
        (x, y) = np.argwhere(room == 5)[0]
        room[room != 0] = 1
        room[x, y] = 5
        fixpoint = False
        while not fixpoint:
            fixpoint = True
            for (x, y) in np.argwhere(room == 5):
                for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1),
                                 (x + 1, y)]:
                    if room[_x, _y] not in [0, 5]:
                        room[_x, _y] = 5
                        fixpoint = False
        for i in [1, 2, 4]:
            if i in room:
                return False
        return True

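    # Generic reachability check (currently only used by the commented-out
    # check in basic_playable): flood fill from src and report whether every
    # coordinate in des was reached.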
    def contaminate(self, room, src, des):
        room = deepcopy(room)
        (x, y) = src
        src_tile = room[x, y]
        room[room != 0] = 1
        room[x, y] = src_tile
        fixpoint = False
        while not fixpoint:
            fixpoint = True
            for (x, y) in np.argwhere(room == src_tile):
                for (_x, _y) in [(x - 1, y), (x, y - 1), (x, y + 1),
                                 (x + 1, y)]:
                    if _x in range(self.dim_room[0]) and _y in range(
                            self.dim_room[1]):
                        if room[_x, _y] not in [0, src_tile]:
                            room[_x, _y] = src_tile
                            fixpoint = False
        reachable = True
        for (x, y) in des:
            if room[x, y] != src_tile:
                reachable = False
                break
        return reachable

    def is_maxsteps(self):
        return self.env_steps >= self.max_steps

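    # Split an observation map into a "fix" map that keeps only the static
    # tiles (wall=0, floor=1, target=2) and a "state" map where
    # "player on target" (6) is rewritten back to "player" (5).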
    def deconstruct_map(self, obs_map):
        state_map = copy.deepcopy(obs_map)
        fix_map = copy.deepcopy(obs_map)
        state_map[state_map == 6] = 5
        fix_map[(fix_map == 3) | (fix_map == 6)] = 2
        fix_map[(fix_map == 4) | (fix_map == 5)] = 1
        return fix_map, state_map

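    # Recombine the two maps: a player (5) in the state map standing on a
    # target (2) in the fix map becomes "player on target" (6) again.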
    def assemble_map(self, state_map, fix_map):
        obs_map = copy.deepcopy(state_map)
        obs_map[(obs_map == 5) & (fix_map == 2)] = 6
        return obs_map

    def close(self):
        if self.viewer is not None:
            self.viewer.close()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
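
# Minimal usage sketch (not part of the original example): it assumes ALGEnv and
# the module-level helpers it relies on (TRAIN_MODES, RENDERING_MODES,
# room_to_tiny_world_rgb, ...) are importable, and uses alg_version=0 so that no
# pretrained PPO agents need to be loaded.
if __name__ == '__main__':
    env = ALGEnv(dim_room=(10, 10), num_boxes=4, alg_version=0,
                 train_mode='cnn', log_interval=0)
    obs = env.reset()
    done, info = False, {}
    while not done:
        # sample a random (x, y, act) triple from the MultiDiscrete action space
        obs, reward, done, info = env.step(env.action_space.sample())
    print('episode reward:', env.episode_reward, 'steps:', info['total_steps'])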
Example No. 2
                env = ProbsVisualizationWrapper(env)
                env = Monitor(env, f'videos/{experiment_name}')
        env = wrap_pytorch(
            wrap_deepmind(
                env,
                clip_rewards=True,
                frame_stack=True,
                scale=False,
            )
        )
        env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env
    return thunk
envs = VecPyTorch(DummyVecEnv([make_env(args.gym_id, args.seed+i, i) for i in range(args.num_envs)]), device)
# if args.prod_mode:
#     envs = VecPyTorch(
#         SubprocVecEnv([make_env(args.gym_id, args.seed+i, i) for i in range(args.num_envs)], "fork"),
#         device
#     )
assert isinstance(envs.action_space, Discrete), "only discrete action space is supported"

# ALGO LOGIC: initialize agent here:
class Scale(nn.Module):
    def __init__(self, scale):
        super().__init__()
        self.scale = scale

    def forward(self, x):
        return x * self.scale
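
# Minimal usage sketch (not part of the original example), assuming the Scale
# module defined above is in scope: it rescales raw uint8 Atari-style frames
# into [0, 1] before they are fed to a network.
import torch

frames = torch.randint(0, 256, (1, 4, 84, 84), dtype=torch.uint8)
scaled = Scale(1.0 / 255)(frames.float())
assert scaled.min() >= 0.0 and scaled.max() <= 1.0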
def test_save_load(tmp_path, model_class):
    """
    Test if 'save' and 'load' saves and loads model correctly
    and if 'get_parameters' and 'set_parameters' and work correctly.

    ''warning does not test function of optimizer parameter load

    :param model_class: (BaseAlgorithm) A RL model
    """

    env = DummyVecEnv([lambda: select_env(model_class)])

    policy_kwargs = dict(net_arch=[16])

    if model_class in {QRDQN, TQC}:
        policy_kwargs.update(dict(n_quantiles=20))

    # create model
    model = model_class("MlpPolicy",
                        env,
                        verbose=1,
                        policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=300)

    env.reset()
    observations = np.concatenate(
        [env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    # Get parameters of different objects
    # deepcopy to avoid referencing to tensors we are about to modify
    original_params = deepcopy(model.get_parameters())

    # Test different error cases of set_parameters.
    # Test that invalid object names throw errors
    invalid_object_params = deepcopy(original_params)
    invalid_object_params[
        "I_should_not_be_a_valid_object"] = "and_I_am_an_invalid_tensor"
    with pytest.raises(ValueError):
        model.set_parameters(invalid_object_params, exact_match=True)
    with pytest.raises(ValueError):
        model.set_parameters(invalid_object_params, exact_match=False)

    # Test that exact_match catches when something was missed.
    missing_object_params = dict(
        (k, v) for k, v in list(original_params.items())[:-1])
    with pytest.raises(ValueError):
        model.set_parameters(missing_object_params, exact_match=True)

    # Test that exact_match catches when something inside state-dict
    # is missing but we have exact_match.
    missing_state_dict_tensor_params = {}
    for object_name in original_params:
        object_params = {}
        missing_state_dict_tensor_params[object_name] = object_params
        # Skip last item in state-dict
        for k, v in list(original_params[object_name].items())[:-1]:
            object_params[k] = v
    with pytest.raises(RuntimeError):
        # PyTorch load_state_dict throws RuntimeError if strict but
        # invalid state-dict.
        model.set_parameters(missing_state_dict_tensor_params,
                             exact_match=True)

    # Test that parameters do indeed change.
    random_params = {}
    for object_name, params in original_params.items():
        # Do not randomize optimizer parameters (custom layout)
        if "optim" in object_name:
            random_params[object_name] = params
        else:
            # Again, skip the last item in state-dict
            random_params[object_name] = OrderedDict(
                (param_name, th.rand_like(param))
                for param_name, param in list(params.items())[:-1])

    # Update model parameters with the new random values
    model.set_parameters(random_params, exact_match=False)

    new_params = model.get_parameters()
    # Check that all params except the final item in each state-dict are different.
    for object_name in original_params:
        # Skip optimizers (no valid comparison with just th.allclose)
        if "optim" in object_name:
            continue
        # state-dicts use ordered dictionaries, so key order
        # is guaranteed.
        last_key = list(original_params[object_name].keys())[-1]
        for k in original_params[object_name]:
            if k == last_key:
                # Should be same as before
                assert th.allclose(
                    original_params[object_name][k], new_params[object_name][k]
                ), "Parameter changed despite not included in the loaded parameters."
            else:
                # Should be different
                assert not th.allclose(
                    original_params[object_name][k], new_params[object_name]
                    [k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # Check if the model loads as expected for every possible choice of device:
    for device in ["auto", "cpu", "cuda"]:
        model = model_class.load(str(tmp_path / "test_save.zip"),
                                 env=env,
                                 device=device)

        # check if the model was loaded to the correct device
        assert model.device.type == get_device(device).type
        assert model.policy.device.type == get_device(device).type

        # check if params are still the same after load
        new_params = model.get_parameters()

        # Check that all params are the same as before save load procedure now
        for object_name in new_params:
            # Skip optimizers (no valid comparison with just th.allclose)
            if "optim" in object_name:
                continue
            for key in params[object_name]:
                assert new_params[object_name][key].device.type == get_device(
                    device).type
                assert th.allclose(
                    params[object_name][key].to("cpu"),
                    new_params[object_name][key].to("cpu")
                ), "Model parameters not the same after save and load."

        # check if model still selects the same actions
        new_selected_actions, _ = model.predict(observations,
                                                deterministic=True)
        assert np.allclose(selected_actions, new_selected_actions, 1e-4)

        # check if learn still works
        model.learn(total_timesteps=300)

        del model

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
Example No. 4
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Separate environment kept outside the vectorized training envs
    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs,
                            norm_obs=True,
                            clip_obs=np.inf,
                            norm_reward=False,
                            clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)

        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs,
                      device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
Example No. 5
    def run_ensemble_strategy(self, A2C_model_kwargs, PPO_model_kwargs,
                              DDPG_model_kwargs, timesteps_dict):
        """Ensemble Strategy that combines PPO, A2C and DDPG"""
        print("============Start Ensemble Strategy============")
        # for ensemble model, it's necessary to feed the last state
        # of the previous model to the current model as the initial state
        last_state_ensemble = []

        ppo_sharpe_list = []
        ddpg_sharpe_list = []
        a2c_sharpe_list = []

        model_use = []
        validation_start_date_list = []
        validation_end_date_list = []
        iteration_list = []

        insample_turbulence = self.df[(self.df.date < self.train_period[1])
                                      & (self.df.date >= self.train_period[0])]
        insample_turbulence_threshold = np.quantile(
            insample_turbulence.turbulence.values, .90)

        start = time.time()
        for i in range(self.rebalance_window + self.validation_window,
                       len(self.unique_trade_date), self.rebalance_window):
            validation_start_date = self.unique_trade_date[
                i - self.rebalance_window - self.validation_window]
            validation_end_date = self.unique_trade_date[i -
                                                         self.rebalance_window]

            validation_start_date_list.append(validation_start_date)
            validation_end_date_list.append(validation_end_date)
            iteration_list.append(i)

            print("============================================")
            ## initial state is empty
            if i - self.rebalance_window - self.validation_window == 0:
                # initial state
                initial = True
            else:
                # previous state
                initial = False

            # Tune the turbulence index based on historical data
            # Turbulence lookback window is one quarter (63 days)
            end_date_index = self.df.index[
                self.df["date"] == self.unique_trade_date[
                    i - self.rebalance_window -
                    self.validation_window]].to_list()[-1]
            start_date_index = end_date_index - 63 + 1

            historical_turbulence = self.df.iloc[start_date_index:(
                end_date_index + 1), :]

            historical_turbulence = historical_turbulence.drop_duplicates(
                subset=['date'])

            historical_turbulence_mean = np.mean(
                historical_turbulence.turbulence.values)

            print(historical_turbulence_mean)

            if historical_turbulence_mean > insample_turbulence_threshold:
                # if the mean of the historical data is greater than the 90% quantile of insample turbulence data
                # then we assume that the current market is volatile,
                # therefore we set the 90% quantile of insample turbulence data as the turbulence threshold
                # meaning the current turbulence can't exceed the 90% quantile of insample turbulence data
                turbulence_threshold = insample_turbulence_threshold
            else:
                # if the mean of the historical data is less than the 90% quantile of insample turbulence data
                # then we tune up the turbulence_threshold, meaning we lower the risk
                turbulence_threshold = np.quantile(
                    insample_turbulence.turbulence.values, 1)
            print("turbulence_threshold: ", turbulence_threshold)

            ############## Environment Setup starts ##############
            ## training env
            train = data_split(
                self.df,
                start=self.train_period[0],
                end=self.unique_trade_date[i - self.rebalance_window -
                                           self.validation_window])
            self.train_env = DummyVecEnv([
                lambda: StockTradingEnv(train,
                                        self.stock_dim,
                                        self.hmax,
                                        self.initial_amount,
                                        self.buy_cost_pct,
                                        self.sell_cost_pct,
                                        self.reward_scaling,
                                        self.state_space,
                                        self.action_space,
                                        self.tech_indicator_list,
                                        print_verbosity=self.print_verbosity)
            ])

            validation = data_split(
                self.df,
                start=self.unique_trade_date[i - self.rebalance_window -
                                             self.validation_window],
                end=self.unique_trade_date[i - self.rebalance_window])
            ############## Environment Setup ends ##############

            ############## Training and Validation starts ##############
            print(
                "======Model training from: ", self.train_period[0], "to ",
                self.unique_trade_date[i - self.rebalance_window -
                                       self.validation_window])
            # print("training: ",len(data_split(df, start=20090000, end=test.datadate.unique()[i-rebalance_window]) ))
            # print("==============Model Training===========")
            print("======A2C Training========")
            model_a2c = self.get_model("a2c",
                                       self.train_env,
                                       policy="MlpPolicy",
                                       model_kwargs=A2C_model_kwargs)
            model_a2c = self.train_model(
                model_a2c,
                "a2c",
                tb_log_name="a2c_{}".format(i),
                iter_num=i,
                total_timesteps=timesteps_dict['a2c'])  #100_000

            print("======A2C Validation from: ", validation_start_date, "to ",
                  validation_end_date)
            val_env_a2c = DummyVecEnv([
                lambda: StockTradingEnv(validation,
                                        self.stock_dim,
                                        self.hmax,
                                        self.initial_amount,
                                        self.buy_cost_pct,
                                        self.sell_cost_pct,
                                        self.reward_scaling,
                                        self.state_space,
                                        self.action_space,
                                        self.tech_indicator_list,
                                        turbulence_threshold=
                                        turbulence_threshold,
                                        iteration=i,
                                        model_name='A2C',
                                        mode='validation',
                                        print_verbosity=self.print_verbosity)
            ])
            val_obs_a2c = val_env_a2c.reset()
            self.DRL_validation(model=model_a2c,
                                test_data=validation,
                                test_env=val_env_a2c,
                                test_obs=val_obs_a2c)
            sharpe_a2c = self.get_validation_sharpe(i, model_name="A2C")
            print("A2C Sharpe Ratio: ", sharpe_a2c)

            print("======PPO Training========")
            model_ppo = self.get_model("ppo",
                                       self.train_env,
                                       policy="MlpPolicy",
                                       model_kwargs=PPO_model_kwargs)
            model_ppo = self.train_model(
                model_ppo,
                "ppo",
                tb_log_name="ppo_{}".format(i),
                iter_num=i,
                total_timesteps=timesteps_dict['ppo'])  #100_000
            print("======PPO Validation from: ", validation_start_date, "to ",
                  validation_end_date)
            val_env_ppo = DummyVecEnv([
                lambda: StockTradingEnv(validation,
                                        self.stock_dim,
                                        self.hmax,
                                        self.initial_amount,
                                        self.buy_cost_pct,
                                        self.sell_cost_pct,
                                        self.reward_scaling,
                                        self.state_space,
                                        self.action_space,
                                        self.tech_indicator_list,
                                        turbulence_threshold=
                                        turbulence_threshold,
                                        iteration=i,
                                        model_name='PPO',
                                        mode='validation',
                                        print_verbosity=self.print_verbosity)
            ])
            val_obs_ppo = val_env_ppo.reset()
            self.DRL_validation(model=model_ppo,
                                test_data=validation,
                                test_env=val_env_ppo,
                                test_obs=val_obs_ppo)
            sharpe_ppo = self.get_validation_sharpe(i, model_name="PPO")
            print("PPO Sharpe Ratio: ", sharpe_ppo)

            print("======DDPG Training========")
            model_ddpg = self.get_model("ddpg",
                                        self.train_env,
                                        policy="MlpPolicy",
                                        model_kwargs=DDPG_model_kwargs)
            model_ddpg = self.train_model(
                model_ddpg,
                "ddpg",
                tb_log_name="ddpg_{}".format(i),
                iter_num=i,
                total_timesteps=timesteps_dict['ddpg'])  #50_000
            print("======DDPG Validation from: ", validation_start_date, "to ",
                  validation_end_date)
            val_env_ddpg = DummyVecEnv([
                lambda: StockTradingEnv(validation,
                                        self.stock_dim,
                                        self.hmax,
                                        self.initial_amount,
                                        self.buy_cost_pct,
                                        self.sell_cost_pct,
                                        self.reward_scaling,
                                        self.state_space,
                                        self.action_space,
                                        self.tech_indicator_list,
                                        turbulence_threshold=
                                        turbulence_threshold,
                                        iteration=i,
                                        model_name='DDPG',
                                        mode='validation',
                                        print_verbosity=self.print_verbosity)
            ])
            val_obs_ddpg = val_env_ddpg.reset()
            self.DRL_validation(model=model_ddpg,
                                test_data=validation,
                                test_env=val_env_ddpg,
                                test_obs=val_obs_ddpg)
            sharpe_ddpg = self.get_validation_sharpe(i, model_name="DDPG")

            ppo_sharpe_list.append(sharpe_ppo)
            a2c_sharpe_list.append(sharpe_a2c)
            ddpg_sharpe_list.append(sharpe_ddpg)

            print("======Best Model Retraining from: ", self.train_period[0],
                  "to ", self.unique_trade_date[i - self.rebalance_window])
            # Environment setup for model retraining up to first trade date
            train_full = data_split(
                self.df,
                start=self.train_period[0],
                end=self.unique_trade_date[i - self.rebalance_window])
            self.train_full_env = DummyVecEnv([
                lambda: StockTradingEnv(train_full,
                                        self.stock_dim,
                                        self.hmax,
                                        self.initial_amount,
                                        self.buy_cost_pct,
                                        self.sell_cost_pct,
                                        self.reward_scaling,
                                        self.state_space,
                                        self.action_space,
                                        self.tech_indicator_list,
                                        print_verbosity=self.print_verbosity)
            ])
            # Model Selection based on sharpe ratio
            if (sharpe_ppo >= sharpe_a2c) & (sharpe_ppo >= sharpe_ddpg):
                model_use.append('PPO')

                model_ensemble = self.get_model("ppo",
                                                self.train_full_env,
                                                policy="MlpPolicy",
                                                model_kwargs=PPO_model_kwargs)
                model_ensemble = self.train_model(
                    model_ensemble,
                    "ensemble",
                    tb_log_name="ensemble_{}".format(i),
                    iter_num=i,
                    total_timesteps=timesteps_dict['ppo'])  #100_000
            elif (sharpe_a2c > sharpe_ppo) & (sharpe_a2c > sharpe_ddpg):
                model_use.append('A2C')

                model_ensemble = self.get_model("a2c",
                                                self.train_full_env,
                                                policy="MlpPolicy",
                                                model_kwargs=A2C_model_kwargs)
                model_ensemble = self.train_model(
                    model_ensemble,
                    "ensemble",
                    tb_log_name="ensemble_{}".format(i),
                    iter_num=i,
                    total_timesteps=timesteps_dict['a2c'])  #100_000
            else:
                model_use.append('DDPG')

                model_ensemble = self.get_model("ddpg",
                                                self.train_full_env,
                                                policy="MlpPolicy",
                                                model_kwargs=DDPG_model_kwargs)
                model_ensemble = self.train_model(
                    model_ensemble,
                    "ensemble",
                    tb_log_name="ensemble_{}".format(i),
                    iter_num=i,
                    total_timesteps=timesteps_dict['ddpg'])  #50_000

            ############## Training and Validation ends ##############

            ############## Trading starts ##############
            print("======Trading from: ",
                  self.unique_trade_date[i - self.rebalance_window], "to ",
                  self.unique_trade_date[i])
            #print("Used Model: ", model_ensemble)
            last_state_ensemble = self.DRL_prediction(
                model=model_ensemble,
                name="ensemble",
                last_state=last_state_ensemble,
                iter_num=i,
                turbulence_threshold=turbulence_threshold,
                initial=initial)
            ############## Trading ends ##############

        end = time.time()
        print("Ensemble Strategy took: ", (end - start) / 60, " minutes")

        df_summary = pd.DataFrame([
            iteration_list, validation_start_date_list,
            validation_end_date_list, model_use, a2c_sharpe_list,
            ppo_sharpe_list, ddpg_sharpe_list
        ]).T
        df_summary.columns = [
            'Iter', 'Val Start', 'Val End', 'Model Used', 'A2C Sharpe',
            'PPO Sharpe', 'DDPG Sharpe'
        ]

        return df_summary
Example No. 6
                    seed=args.seed,
                    tensorboard_log=args.tensorboard)
    #--------------------------------------------------------#

    #-------------------------ERROR?-------------------------#
    else:
        raise RuntimeError('Algorithm specified is not registered.')
    #--------------------------------------------------------#

    # Calculating n_timesteps_episode for training
    n_timesteps_episode = env.simulator._eplus_one_epi_len / \
        env.simulator._eplus_run_stepsize
    timesteps = args.episodes * n_timesteps_episode

    # For callbacks processing
    env_vec = DummyVecEnv([lambda: env])

    # Using Callbacks for training
    callbacks = []

    # Set up Evaluation and saving best model
    if args.evaluation:
        eval_callback = LoggerEvalCallback(
            env_vec,
            best_model_save_path='best_model/' + name + '/',
            log_path='best_model/' + name + '/',
            eval_freq=n_timesteps_episode * args.eval_freq,
            deterministic=True,
            render=False,
            n_eval_episodes=args.eval_length)
        callbacks.append(eval_callback)
Example No. 7
def act(
    flags,
    actor_index: int,
    free_queue: mp.SimpleQueue,
    full_queue: mp.SimpleQueue,
    model: torch.nn.Module,
    buffers: Buffers,
    initial_agent_state_buffers,
):
    try:
        logging.info("Actor %i started.", actor_index)
        timings = Timings()  # Keep track of how fast things are.

        gym_env = create_env(flags)
        seed = actor_index ^ int.from_bytes(os.urandom(4), byteorder="little")
        gym_env.seed(seed)
        env = Environment(gym_env)

        def make_env(flags):
            def thunk():
                env = create_env(flags)
                return env

            return thunk

        envs = DummyVecEnv([make_env(flags) for i in range(1)])

        env_output = env.initial()
        envs.reset()
        agent_state = model.initial_state(batch_size=1)
        agent_output, unused_state = model(env_output, agent_state)
        while True:
            index = free_queue.get()
            if index is None:
                break

            # Write old rollout end.
            for key in env_output:
                buffers[key][index][0, ...] = env_output[key]
            for key in agent_output:
                buffers[key][index][0, ...] = agent_output[key]
            for i, tensor in enumerate(agent_state):
                initial_agent_state_buffers[index][i][...] = tensor

            # Do new rollout.
            for t in range(flags.unroll_length):
                timings.reset()

                with torch.no_grad():
                    agent_output, agent_state = model(env_output, agent_state)

                # timings.time("model")

                env_output = env.step(agent_output["action"])
                # env_output = env.step(agent_output["action"])
                # envs.step((torch.randint(0, envs.action_space.n, (envs.num_envs,))).numpy())
                assert agent_output["action"] == env_output["last_action"]
                timings.time("step")

                for key in env_output:
                    buffers[key][index][t + 1, ...] = env_output[key]
                for key in agent_output:
                    buffers[key][index][t + 1, ...] = agent_output[key]

                timings.time("write")
            full_queue.put(index)

        if actor_index == 0:
            logging.info("Actor %i: %s", actor_index, timings.summary())

    except KeyboardInterrupt:
        pass  # Return silently.
    except Exception as e:
        logging.error("Exception in worker process %i", actor_index)
        traceback.print_exc()
        print()
        raise e
Example No. 8
    def __init__(self,
                 policy: Type[BasePolicy],
                 env: Union[GymEnv, str],
                 policy_base: Type[BasePolicy],
                 learning_rate: Union[float, Callable],
                 policy_kwargs: Dict[str, Any] = None,
                 verbose: int = 0,
                 device: Union[th.device, str] = 'auto',
                 support_multi_env: bool = False,
                 create_eval_env: bool = False,
                 monitor_wrapper: bool = True,
                 seed: Optional[int] = None,
                 use_sde: bool = False,
                 sde_sample_freq: int = -1):

        if isinstance(policy, str) and policy_base is not None:
            self.policy_class = get_policy_from_name(policy_base, policy)
        else:
            self.policy_class = policy

        self.device = get_device(device)
        if verbose > 0:
            print(f"Using {self.device} device")

        self.env = None  # type: Optional[GymEnv]
        # get VecNormalize object if needed
        self._vec_normalize_env = unwrap_vec_normalize(env)
        self.verbose = verbose
        self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
        self.observation_space = None  # type: Optional[gym.spaces.Space]
        self.action_space = None  # type: Optional[gym.spaces.Space]
        self.n_envs = None
        self.num_timesteps = 0
        self.eval_env = None
        self.seed = seed
        self.action_noise = None  # type: Optional[ActionNoise]
        self.start_time = None
        self.policy = None
        self.learning_rate = learning_rate
        self.lr_schedule = None  # type: Optional[Callable]
        self._last_obs = None  # type: Optional[np.ndarray]
        # When using VecNormalize:
        self._last_original_obs = None  # type: Optional[np.ndarray]
        self._episode_num = 0
        # Used for gSDE only
        self.use_sde = use_sde
        self.sde_sample_freq = sde_sample_freq
        # Track the training progress (from 1 to 0)
        # this is used to update the learning rate
        self._current_progress = 1
        # Buffers for logging
        self.ep_info_buffer = None  # type: Optional[deque]
        self.ep_success_buffer = None  # type: Optional[deque]
        # For logging
        self._n_updates = 0  # type: int

        # Create and wrap the env if needed
        if env is not None:
            if isinstance(env, str):
                if create_eval_env:
                    eval_env = gym.make(env)
                    if monitor_wrapper:
                        eval_env = Monitor(eval_env, filename=None)
                    self.eval_env = DummyVecEnv([lambda: eval_env])
                if self.verbose >= 1:
                    print(
                        "Creating environment from the given name, wrapped in a DummyVecEnv."
                    )

                env = gym.make(env)
                if monitor_wrapper:
                    env = Monitor(env, filename=None)
                env = DummyVecEnv([lambda: env])

            env = self._wrap_env(env)

            self.observation_space = env.observation_space
            self.action_space = env.action_space
            self.n_envs = env.num_envs
            self.env = env

            if not support_multi_env and self.n_envs > 1:
                raise ValueError(
                    "Error: the model does not support multiple envs requires a single vectorized"
                    " environment.")
Example No. 9
def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that unnormalized reward is same as original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward, env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()
    # Check that unnormalization works
    assert allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)
    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
Example No. 10
def test(seed,
         model_filename,
         vec_filename,
         train,
         test,
         test_as_class=0,
         render=False,
         save_file="default.yml"):
    global g_step, g_fMRI_data
    print("Testing:")
    total_rewards = []
    distance_xs = []
    if True:
        g_step = 0
        g_fMRI_data = np.zeros(shape=[args.test_steps, 256], dtype=np.float32)

        print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
        print(
            f" Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        if test_as_class >= 0:
            bodyinfo = test_as_class
        else:
            if args.with_bodyinfo:
                bodyinfo = test // 100
            else:
                bodyinfo = 0
        eval_env = utils.make_env(render=render,
                                  robot_body=test,
                                  body_info=bodyinfo)
        eval_env = DummyVecEnv([eval_env])
        if args.vec_normalize:
            eval_env = VecNormalize.load(vec_filename, eval_env)
        eval_env.norm_reward = False

        eval_env.seed(seed)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        if render:
            # eval_env.env_method("set_view")
            print(
                "\n\nWait for a while, so I have the time to press Ctrl+F11 to enter FullScreen Mode.\n\n"
            )
            time.sleep(
                3
            )  # Wait for a while, so I have the time to press Ctrl+F11 to enter FullScreen Mode.
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in tqdm(range(args.test_steps)):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if render:
                eval_env.envs[0].camera_adjust()
                (width, height, rgbPixels, _,
                 _) = eval_env.envs[0].env.env._p.getCameraImage(
                     1920, 1080, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL)
                image = rgbPixels[:, :, :3]
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                cv2.imwrite(
                    f"{folder}/fMRI_videos/getCameraImage_b{test}_s{seed}_{step:05}.png",
                    image)
            if done:
                # it should not matter if the env reset. I guess...
                # break
                pass
            else:  # the last observation will be after reset, so skip the last
                distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
            # if render:
            #    time.sleep(0.01)

        eval_env.close()
        print(
            f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, total_reward {total_reward}, distance_x {distance_x}"
        )

        if args.save_fmri:
            base_fMRI_data = None
            sorted_data = g_fMRI_data.copy()
            if test != 0 or seed != 0:
                # if sorted_arg exists, use the existing one
                # because we want to compare the patterns of two experiments
                sorted_arg = np.load(f"{folder}/sorted_arg.npy")
                base_fMRI_data = np.load(f"{folder}/base_fMRI_data.npy")
            else:
                sorted_arg = np.argsort(np.mean(sorted_data, axis=0))
                np.save(f"{folder}/sorted_arg.npy", sorted_arg)
                base_fMRI_data = g_fMRI_data.copy()
                np.save(f"{folder}/base_fMRI_data.npy", base_fMRI_data)

            sorted_data = sorted_data[:, sorted_arg]
            base_fMRI_data = base_fMRI_data[:, sorted_arg]

            for step in tqdm(range(args.test_steps)):
                plt.close()
                plt.figure(figsize=[10, 4])
                if test != 0 or seed != 0:
                    x = sorted_data[step]
                    plt.bar(np.arange(len(x)), x, color=[0.4, 0.7, 0.9, 0.5])
                x = base_fMRI_data[step]
                plt.bar(np.arange(len(x)), x, color=[0.3, 0.3, 0.3, 0.5])
                plt.savefig(
                    f"{folder}/fMRI_videos/barchart_b{test}_s{seed}_{step:05}.png"
                )
                plt.close()

        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # convert to plain floats so yaml does not serialize numpy float64 values
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]

    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)
Example No. 11
class BaseRLModel(ABC):
    """
    The base RL model

    :param policy: (Type[BasePolicy]) Policy object
    :param env: (Union[GymEnv, str]) The environment to learn from
                (if registered in Gym, can be str. Can be None for loading trained models)
    :param policy_base: (Type[BasePolicy]) The base policy used by this method
    :param learning_rate: (float or callable) learning rate for the optimizer,
        it can be a function of the current progress (from 1 to 0)
    :param policy_kwargs: (Dict[str, Any]) Additional arguments to be passed to the policy on creation
    :param verbose: (int) The verbosity level: 0 none, 1 training information, 2 debug
    :param device: (Union[th.device, str]) Device on which the code should run.
        By default, it will try to use a CUDA-compatible device and fall back to CPU
        if that is not possible.
    :param support_multi_env: (bool) Whether the algorithm supports training
        with multiple environments (as in A2C)
    :param create_eval_env: (bool) Whether to create a second environment that will be
        used for evaluating the agent periodically. (Only available when passing a string for the environment)
    :param monitor_wrapper: (bool) When creating an environment, whether to wrap it
        or not in a Monitor wrapper.
    :param seed: (Optional[int]) Seed for the pseudo random generators
    :param use_sde: (bool) Whether to use generalized State Dependent Exploration (gSDE)
        instead of action noise exploration (default: False)
    :param sde_sample_freq: (int) Sample a new noise matrix every n steps when using gSDE
        Default: -1 (only sample at the beginning of the rollout)
    """
    def __init__(self,
                 policy: Type[BasePolicy],
                 env: Union[GymEnv, str],
                 policy_base: Type[BasePolicy],
                 learning_rate: Union[float, Callable],
                 policy_kwargs: Dict[str, Any] = None,
                 verbose: int = 0,
                 device: Union[th.device, str] = 'auto',
                 support_multi_env: bool = False,
                 create_eval_env: bool = False,
                 monitor_wrapper: bool = True,
                 seed: Optional[int] = None,
                 use_sde: bool = False,
                 sde_sample_freq: int = -1):

        if isinstance(policy, str) and policy_base is not None:
            self.policy_class = get_policy_from_name(policy_base, policy)
        else:
            self.policy_class = policy

        self.device = get_device(device)
        if verbose > 0:
            print(f"Using {self.device} device")

        self.env = None  # type: Optional[GymEnv]
        # get VecNormalize object if needed
        self._vec_normalize_env = unwrap_vec_normalize(env)
        self.verbose = verbose
        self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
        self.observation_space = None  # type: Optional[gym.spaces.Space]
        self.action_space = None  # type: Optional[gym.spaces.Space]
        self.n_envs = None
        self.num_timesteps = 0
        self.eval_env = None
        self.seed = seed
        self.action_noise = None  # type: Optional[ActionNoise]
        self.start_time = None
        self.policy = None
        self.learning_rate = learning_rate
        self.lr_schedule = None  # type: Optional[Callable]
        self._last_obs = None  # type: Optional[np.ndarray]
        # When using VecNormalize:
        self._last_original_obs = None  # type: Optional[np.ndarray]
        self._episode_num = 0
        # Used for gSDE only
        self.use_sde = use_sde
        self.sde_sample_freq = sde_sample_freq
        # Track the training progress (from 1 to 0)
        # this is used to update the learning rate
        self._current_progress = 1
        # Buffers for logging
        self.ep_info_buffer = None  # type: Optional[deque]
        self.ep_success_buffer = None  # type: Optional[deque]
        # For logging
        self._n_updates = 0  # type: int

        # Create and wrap the env if needed
        if env is not None:
            if isinstance(env, str):
                if create_eval_env:
                    eval_env = gym.make(env)
                    if monitor_wrapper:
                        eval_env = Monitor(eval_env, filename=None)
                    self.eval_env = DummyVecEnv([lambda: eval_env])
                if self.verbose >= 1:
                    print(
                        "Creating environment from the given name, wrapped in a DummyVecEnv."
                    )

                env = gym.make(env)
                if monitor_wrapper:
                    env = Monitor(env, filename=None)
                env = DummyVecEnv([lambda: env])

            env = self._wrap_env(env)

            self.observation_space = env.observation_space
            self.action_space = env.action_space
            self.n_envs = env.num_envs
            self.env = env

            if not support_multi_env and self.n_envs > 1:
                raise ValueError(
                    "Error: the model does not support multiple envs requires a single vectorized"
                    " environment.")

    def _wrap_env(self, env: GymEnv) -> VecEnv:
        if not isinstance(env, VecEnv):
            if self.verbose >= 1:
                print("Wrapping the env in a DummyVecEnv.")
            env = DummyVecEnv([lambda: env])

        if is_image_space(env.observation_space) and not isinstance(
                env, VecTransposeImage):
            if self.verbose >= 1:
                print("Wrapping the env in a VecTransposeImage.")
            env = VecTransposeImage(env)
        return env

    @abstractmethod
    def _setup_model(self) -> None:
        """
        Create networks, buffer and optimizers
        """
        raise NotImplementedError()

    def _get_eval_env(self, eval_env: Optional[GymEnv]) -> Optional[GymEnv]:
        """
        Return the environment that will be used for evaluation.

        :param eval_env: (Optional[GymEnv]))
        :return: (Optional[GymEnv])
        """
        if eval_env is None:
            eval_env = self.eval_env

        if eval_env is not None:
            eval_env = self._wrap_env(eval_env)
            assert eval_env.num_envs == 1
        return eval_env

    def _setup_lr_schedule(self) -> None:
        """Transform to callable if needed."""
        self.lr_schedule = get_schedule_fn(self.learning_rate)

    def _update_current_progress(self, num_timesteps: int,
                                 total_timesteps: int) -> None:
        """
        Compute current progress (from 1 to 0)

        :param num_timesteps: current number of timesteps
        :param total_timesteps:
        """
        self._current_progress = 1.0 - float(num_timesteps) / float(
            total_timesteps)

    def _update_learning_rate(
        self, optimizers: Union[List[th.optim.Optimizer],
                                th.optim.Optimizer]) -> None:
        """
        Update the optimizers learning rate using the current learning rate schedule
        and the current progress (from 1 to 0).

        :param optimizers: (Union[List[th.optim.Optimizer], th.optim.Optimizer])
            An optimizer or a list of optimizers.
        """
        # Log the current learning rate
        logger.logkv("learning_rate", self.lr_schedule(self._current_progress))

        if not isinstance(optimizers, list):
            optimizers = [optimizers]
        for optimizer in optimizers:
            update_learning_rate(optimizer,
                                 self.lr_schedule(self._current_progress))

    @staticmethod
    def safe_mean(arr: Union[np.ndarray, list, deque]) -> np.ndarray:
        """
        Compute the mean of an array if there is at least one element.
        For empty array, return NaN. It is used for logging only.

        :param arr:
        :return:
        """
        return np.nan if len(arr) == 0 else np.mean(arr)

    def get_env(self) -> Optional[VecEnv]:
        """
        Returns the current environment (can be None if not defined).

        :return: (Optional[VecEnv]) The current environment
        """
        return self.env

    def get_vec_normalize_env(self) -> Optional[VecNormalize]:
        """
        Return the ``VecNormalize`` wrapper of the training env
        if it exists.
        :return: Optional[VecNormalize] The ``VecNormalize`` env.
        """
        return self._vec_normalize_env

    @staticmethod
    def check_env(env: GymEnv, observation_space: gym.spaces.Space,
                  action_space: gym.spaces.Space):
        """
        Checks the validity of the environment to load vs the one used for training.
        Checked parameters:
        - observation_space
        - action_space

        :param env: (GymEnv)
        :param observation_space: (gym.spaces.Space)
        :param action_space: (gym.spaces.Space)
        """
        if (observation_space != env.observation_space
                # Special cases for images that need to be transposed
                and
                not (is_image_space(env.observation_space) and
                     observation_space == VecTransposeImage.transpose_space(
                         env.observation_space))):
            raise ValueError(
                f'Observation spaces do not match: {observation_space} != {env.observation_space}'
            )
        if action_space != env.action_space:
            raise ValueError(
                f'Action spaces do not match: {action_space} != {env.action_space}'
            )

    def set_env(self, env: GymEnv) -> None:
        """
        Checks the validity of the environment, and if it is coherent, sets it as the current environment.
        Furthermore, wraps any non-vectorized env into a vectorized one.
        Checked parameters:
        - observation_space
        - action_space

        :param env: The environment for learning a policy
        """
        self.check_env(env, self.observation_space, self.action_space)
        # it must be coherent now
        # if it is not a VecEnv, make it a VecEnv
        env = self._wrap_env(env)

        self.n_envs = env.num_envs
        self.env = env

    def get_torch_variables(self) -> Tuple[List[str], List[str]]:
        """
        Get the names of the torch variables that will be saved.
        ``th.save`` and ``th.load`` will be used with the right device
        instead of the default pickling strategy.

        :return: (Tuple[List[str], List[str]])
            name of the variables with state dicts to save, name of additional torch tensors,
        """
        state_dicts = ["policy"]

        return state_dicts, []

    @abstractmethod
    def learn(self,
              total_timesteps: int,
              callback: MaybeCallback = None,
              log_interval: int = 100,
              tb_log_name: str = "run",
              eval_env: Optional[GymEnv] = None,
              eval_freq: int = -1,
              n_eval_episodes: int = 5,
              eval_log_path: Optional[str] = None,
              reset_num_timesteps: bool = True) -> 'BaseRLModel':
        """
        Return a trained model.

        :param total_timesteps: (int) The total number of samples to train on
        :param callback: (function (dict, dict) -> boolean) Function called at every step with the state of the algorithm.
            It takes the local and global variables. If it returns False, training is aborted.
        :param log_interval: (int) The number of timesteps before logging.
        :param tb_log_name: (str) the name of the run for tensorboard log
        :param reset_num_timesteps: (bool) whether or not to reset the current timestep number (used in logging)
        :param eval_env: (gym.Env) Environment that will be used to evaluate the agent
        :param eval_freq: (int) Evaluate the agent every ``eval_freq`` timesteps (this may vary a little)
        :param n_eval_episodes: (int) Number of episodes to evaluate the agent
        :param eval_log_path: (Optional[str]) Path to a folder where the evaluations will be saved
        :return: (BaseRLModel) the trained model
        """
        raise NotImplementedError()

    def predict(
        self,
        observation: np.ndarray,
        state: Optional[np.ndarray] = None,
        mask: Optional[np.ndarray] = None,
        deterministic: bool = False
    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
        """
        Get the model's action(s) from an observation

        :param observation: (np.ndarray) the input observation
        :param state: (Optional[np.ndarray]) The last states (can be None, used in recurrent policies)
        :param mask: (Optional[np.ndarray]) The last masks (can be None, used in recurrent policies)
        :param deterministic: (bool) Whether or not to return deterministic actions.
        :return: (Tuple[np.ndarray, Optional[np.ndarray]]) the model's action and the next state
            (used in recurrent policies)
        """
        return self.policy.predict(observation, state, mask, deterministic)

    @classmethod
    def load(cls, load_path: str, env: Optional[GymEnv] = None, **kwargs):
        """
        Load the model from a zip-file

        :param load_path: the location of the saved data
        :param env: the new environment to run the loaded model on
            (can be None if you only need prediction from a trained model); it has priority over any saved environment
        :param kwargs: extra arguments to change the model when loading
        """
        data, params, tensors = cls._load_from_file(load_path)

        if 'policy_kwargs' in data:
            for arg_to_remove in ['device']:
                if arg_to_remove in data['policy_kwargs']:
                    del data['policy_kwargs'][arg_to_remove]

        if 'policy_kwargs' in kwargs and kwargs['policy_kwargs'] != data[
                'policy_kwargs']:
            raise ValueError(
                f"The specified policy kwargs do not equal the stored policy kwargs."
                f"Stored kwargs: {data['policy_kwargs']}, specified kwargs: {kwargs['policy_kwargs']}"
            )

        # check if observation space and action space are part of the saved parameters
        if ("observation_space" not in data
                or "action_space" not in data) and "env" not in data:
            raise ValueError(
                "The observation_space and action_space was not given, can't verify new environments"
            )
        # check if given env is valid
        if env is not None:
            cls.check_env(env, data["observation_space"], data["action_space"])
        # if no new env was given use stored env if possible
        if env is None and "env" in data:
            env = data["env"]

        # noinspection PyArgumentList
        model = cls(policy=data["policy_class"],
                    env=env,
                    device='auto',
                    _init_setup_model=False)

        # load parameters
        model.__dict__.update(data)
        model.__dict__.update(kwargs)
        if not hasattr(model, "_setup_model") and len(params) > 0:
            raise NotImplementedError(
                f"{cls} has no ``_setup_model()`` method")
        model._setup_model()

        # put state_dicts back in place
        for name in params:
            attr = recursive_getattr(model, name)
            attr.load_state_dict(params[name])

        # put tensors back in place
        if tensors is not None:
            for name in tensors:
                recursive_setattr(model, name, tensors[name])

        return model

    @staticmethod
    def _load_from_file(
        load_path: str,
        load_data: bool = True
    ) -> (Tuple[Optional[Dict[str, Any]], Optional[TensorDict],
                Optional[TensorDict]]):
        """ Load model data from a .zip archive

        :param load_path: Where to load the model from
        :param load_data: Whether we should load and return data
            (class parameters). Mainly used by 'load_parameters' to only load model parameters (weights)
        :return: (dict),(dict),(dict) Class parameters, model state_dicts (dict of state_dict)
            and dict of extra tensors
        """
        # Check if file exists if load_path is a string
        if isinstance(load_path, str):
            if not os.path.exists(load_path):
                if os.path.exists(load_path + ".zip"):
                    load_path += ".zip"
                else:
                    raise ValueError(
                        f"Error: the file {load_path} could not be found")

        # set device to cpu if cuda is not available
        device = get_device()

        # Open the zip archive and load data
        try:
            with zipfile.ZipFile(load_path, "r") as archive:
                namelist = archive.namelist()
                # If data or parameters is not in the
                # zip archive, assume they were stored
                # as None (_save_to_file_zip allows this).
                data = None
                tensors = None
                params = {}

                if "data" in namelist and load_data:
                    # Load class parameters and convert to string
                    json_data = archive.read("data").decode()
                    data = json_to_data(json_data)

                if "tensors.pth" in namelist and load_data:
                    # Load extra tensors
                    with archive.open('tensors.pth', mode="r") as tensor_file:
                        # File has to be seekable, but opt_param_file is not, so load in BytesIO first
                        # fixed in python >= 3.7
                        file_content = io.BytesIO()
                        file_content.write(tensor_file.read())
                        # go to start of file
                        file_content.seek(0)
                        # load the parameters with the right ``map_location``
                        tensors = th.load(file_content, map_location=device)

                # check for all other .pth files
                other_files = [
                    file_name for file_name in namelist
                    if os.path.splitext(file_name)[1] == ".pth"
                    and file_name != "tensors.pth"
                ]
                # if there are any other files which end with .pth and aren't "tensors.pth",
                # assume that each of them holds optimizer parameters
                if len(other_files) > 0:
                    for file_path in other_files:
                        with archive.open(file_path,
                                          mode="r") as opt_param_file:
                            # File has to be seekable, but opt_param_file is not, so load in BytesIO first
                            # fixed in python >= 3.7
                            file_content = io.BytesIO()
                            file_content.write(opt_param_file.read())
                            # go to start of file
                            file_content.seek(0)
                            # load the parameters with the right ``map_location``
                            params[os.path.splitext(file_path)[0]] = th.load(
                                file_content, map_location=device)

        except zipfile.BadZipFile:
            # load_path wasn't a zip file
            raise ValueError(f"Error: the file {load_path} wasn't a zip-file")
        return data, params, tensors

    def set_random_seed(self, seed: Optional[int] = None) -> None:
        """
        Set the seed of the pseudo-random generators
        (python, numpy, pytorch, gym, action_space)

        :param seed: (int)
        """
        if seed is None:
            return
        set_random_seed(seed, using_cuda=self.device == th.device('cuda'))
        self.action_space.seed(seed)
        if self.env is not None:
            self.env.seed(seed)
        if self.eval_env is not None:
            self.eval_env.seed(seed)

    def _init_callback(self,
                       callback: Union[None, Callable, List[BaseCallback],
                                       BaseCallback],
                       eval_env: Optional[VecEnv] = None,
                       eval_freq: int = 10000,
                       n_eval_episodes: int = 5,
                       log_path: Optional[str] = None) -> BaseCallback:
        """
        :param callback: (Union[callable, [BaseCallback], BaseCallback, None])
        :return: (BaseCallback)
        """
        # Convert a list of callbacks into a callback
        if isinstance(callback, list):
            callback = CallbackList(callback)

        # Convert functional callback to object
        if not isinstance(callback, BaseCallback):
            callback = ConvertCallback(callback)

        # Create eval callback in charge of the evaluation
        if eval_env is not None:
            eval_callback = EvalCallback(eval_env,
                                         best_model_save_path=log_path,
                                         log_path=log_path,
                                         eval_freq=eval_freq,
                                         n_eval_episodes=n_eval_episodes)
            callback = CallbackList([callback, eval_callback])

        callback.init_callback(self)
        return callback

    def _setup_learn(
        self,
        eval_env: Optional[GymEnv],
        callback: Union[None, Callable, List[BaseCallback],
                        BaseCallback] = None,
        eval_freq: int = 10000,
        n_eval_episodes: int = 5,
        log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> 'BaseCallback':
        """
        Initialize different variables needed for training.

        :param eval_env: (Optional[GymEnv])
        :param callback: (Union[None, BaseCallback, List[BaseCallback, Callable]])
        :param eval_freq: (int)
        :param n_eval_episodes: (int)
        :param log_path: (Optional[str]) Path to a log folder
        :param reset_num_timesteps: (bool) Whether to reset or not the ``num_timesteps`` attribute
        :return: (BaseCallback)
        """
        self.start_time = time.time()
        self.ep_info_buffer = deque(maxlen=100)
        self.ep_success_buffer = deque(maxlen=100)

        if self.action_noise is not None:
            self.action_noise.reset()

        if reset_num_timesteps:
            self.num_timesteps = 0
            self._episode_num = 0

        # Avoid resetting the environment when calling ``.learn()`` multiple times in a row
        if reset_num_timesteps or self._last_obs is None:
            self._last_obs = self.env.reset()
            # Retrieve unnormalized observation for saving into the buffer
            if self._vec_normalize_env is not None:
                self._last_original_obs = self._vec_normalize_env.get_original_obs(
                )

        if eval_env is not None and self.seed is not None:
            eval_env.seed(self.seed)

        eval_env = self._get_eval_env(eval_env)

        # Create eval callback if needed
        callback = self._init_callback(callback, eval_env, eval_freq,
                                       n_eval_episodes, log_path)

        return callback

    def _update_info_buffer(self,
                            infos: List[Dict[str, Any]],
                            dones: Optional[np.ndarray] = None) -> None:
        """
        Retrieve reward and episode length and update the buffer
        if using Monitor wrapper.

        :param infos: ([dict])
        """
        if dones is None:
            dones = np.array([False] * len(infos))
        for idx, info in enumerate(infos):
            maybe_ep_info = info.get('episode')
            maybe_is_success = info.get('is_success')
            if maybe_ep_info is not None:
                self.ep_info_buffer.extend([maybe_ep_info])
            if maybe_is_success is not None and dones[idx]:
                self.ep_success_buffer.append(maybe_is_success)

    @staticmethod
    def _save_to_file_zip(save_path: str,
                          data: Dict[str, Any] = None,
                          params: Dict[str, Any] = None,
                          tensors: Dict[str, Any] = None) -> None:
        """
        Save model to a zip archive.

        :param save_path: Where to store the model
        :param data: Class parameters being stored
        :param params: Model parameters being stored, expected to contain an entry for every
                       state_dict with its name and the state_dict
        :param tensors: Extra tensor variables expected to contain name and value of tensors
        """

        # data/params can be None, so do not
        # try to serialize them blindly
        if data is not None:
            serialized_data = data_to_json(data)

        # Check postfix if save_path is a string
        if isinstance(save_path, str):
            _, ext = os.path.splitext(save_path)
            if ext == "":
                save_path += ".zip"

        # Create a zip-archive and write our objects
        # there. This works when save_path is either
        # str or a file-like
        with zipfile.ZipFile(save_path, "w") as archive:
            # Do not try to save "None" elements
            if data is not None:
                archive.writestr("data", serialized_data)
            if tensors is not None:
                with archive.open('tensors.pth', mode="w") as tensors_file:
                    th.save(tensors, tensors_file)
            if params is not None:
                for file_name, dict_ in params.items():
                    with archive.open(file_name + '.pth',
                                      mode="w") as param_file:
                        th.save(dict_, param_file)

    def excluded_save_params(self) -> List[str]:
        """
        Returns the names of the parameters that should be excluded by default
        when saving the model.

        :return: ([str]) List of parameters that should be excluded from save
        """
        return [
            "policy", "device", "env", "eval_env", "replay_buffer",
            "rollout_buffer", "_vec_normalize_env"
        ]

    def save(self,
             path: str,
             exclude: Optional[List[str]] = None,
             include: Optional[List[str]] = None) -> None:
        """
        Save all the attributes of the object and the model parameters in a zip-file.

        :param path: path to the file where the rl agent should be saved
        :param exclude: name of parameters that should be excluded in addition to the default ones
        :param include: name of parameters that might be excluded but should be included anyway
        """
        # copy parameter list so we don't mutate the original dict
        data = self.__dict__.copy()
        # use standard list of excluded parameters if none given
        if exclude is None:
            exclude = self.excluded_save_params()
        else:
            # append standard exclude params to the given params
            exclude.extend([
                param for param in self.excluded_save_params()
                if param not in exclude
            ])

        # do not exclude params if they are specifically included
        if include is not None:
            exclude = [
                param_name for param_name in exclude
                if param_name not in include
            ]

        state_dicts_names, tensors_names = self.get_torch_variables()
        # any params that are in the save vars must not be saved by data
        torch_variables = state_dicts_names + tensors_names
        for torch_var in torch_variables:
            # we need to get only the name of the top most module as we'll remove that
            var_name = torch_var.split('.')[0]
            exclude.append(var_name)

        # Remove parameter entries of parameters which are to be excluded
        for param_name in exclude:
            if param_name in data:
                data.pop(param_name, None)

        # Build dict of tensor variables
        tensors = None
        if tensors_names is not None:
            tensors = {}
            for name in tensors_names:
                attr = recursive_getattr(self, name)
                tensors[name] = attr

        # Build dict of state_dicts
        params_to_save = {}
        for name in state_dicts_names:
            attr = recursive_getattr(self, name)
            # Retrieve state dict
            params_to_save[name] = attr.state_dict()

        self._save_to_file_zip(path,
                               data=data,
                               params=params_to_save,
                               tensors=tensors)
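Concrete algorithms (PPO, A2C, SAC, ...) inherit this save/load machinery; a minimal round-trip sketch under that assumption, using the standard CartPole environment:

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Train briefly, save to a zip archive, then reload with a fresh env.
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = PPO("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=1000)
model.save("ppo_cartpole")                 # writes ppo_cartpole.zip

reloaded = PPO.load("ppo_cartpole", env=env)
obs = env.reset()
action, _state = reloaded.predict(obs, deterministic=True)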
Example No. 12
if args.smp_bodies_aligned:
    common.args.custom_alignment = "::".join(
        [cheetah_orders[x] for x in robot_bodies])
hyperparams = common.load_hyperparameters(conf_name=args.rl_hyperparameter)

common.args.model_filename = "output_data/tmp/aligned/sd0/best_model.zip"

# for test_body in robot_bodies+zero_shot_bodies:
for test_body in ["cheetah_3_balanced"]:
    if args.smp_bodies_aligned:
        common.args.custom_alignment = cheetah_orders[test_body]
    # if test_body == "cheetah_6_front":
    #     eval_venv = DummyVecEnv([make_env(test_body)])
    # else:
    eval_venv = DummyVecEnv([make_env(test_body,
                                      rank=0)])  # 1 avoid visualization

    if args.vec_normalize:
        eval_venv = VecNormalize.load(
            common.get_vec_pkl_from_model_filename(args.model_filename),
            eval_venv)

    hyperparams = common.clean_hyperparams_before_run(hyperparams)
    model = PPO.load(common.args.model_filename, env=eval_venv, **hyperparams)

    obs = eval_venv.reset()

    while True:
        action = model.predict(obs)
        obs, reward, done, _ = eval_venv.step(action[0])
        if done:
Example No. 13
    "total_timesteps": 200,
    "env_name": "CartPole-v1",
}
run = wandb.init(
    project="sb3",
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    save_code=True,  # optional
)


def make_env():
    env = gym.make(config["env_name"])
    env = Monitor(env)  # record stats such as returns
    return env


env = DummyVecEnv([make_env])
model = PPO(config["policy_type"],
            env,
            verbose=1,
            tensorboard_log=f"runs/{run.name}")

model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.name}",
    ),
)
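A small optional follow-up, assuming the `run` handle from the script above: closing the run flushes any buffered metrics and uploaded model files.

# Explicitly mark the W&B run as finished (otherwise it is closed at
# interpreter exit).
run.finish()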
Example No. 14
    def __init__(self,
                 dim_room=(10, 10),
                 num_boxes=4,
                 reset=True,
                 log_interval=1000,
                 alg_version=0,
                 train_mode='cnn',
                 agent_lb_path=None,
                 agent_ub_path=None,
                 init_probs=[0.5, 0.5, 0.5]):

        assert train_mode in TRAIN_MODES
        self.train_mode = train_mode
        if log_interval > 0:
            self.log_train_info = True
        else:
            self.log_train_info = False

        # 0: basic playable map
        # 1: playable map
        # 2: hardness adjustable map
        self.alg_version = alg_version
        if alg_version == 0:
            pass
        else:
            env_li = [
                lambda: SokobanEnv(dim_room=dim_room,
                                   max_steps=50,
                                   num_boxes=num_boxes,
                                   train_mode=train_mode,
                                   log_train_info=False)
            ]
            self.soko_env = DummyVecEnv(env_li)
            self.agent_ub = PPO.load(agent_ub_path, env=self.soko_env)
            print('loaded', agent_ub_path, 'as ub')
            if alg_version == 2:
                self.agent_lb = PPO.load(agent_lb_path, env=self.soko_env)
                print('loaded', agent_lb_path, 'as lb')

        # General Configuration
        self.dim_room = dim_room
        self.num_boxes = num_boxes
        self.num_players = 1

        # Training hyperparams
        self.max_prefer_subs = dim_room[0] * dim_room[1] // 2
        self.place_target_prob = init_probs[0]
        self.place_box_prob = init_probs[1]
        self.place_player_prob = init_probs[2]

        # Log info
        self.start_time = time.time()
        self.train_result_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
        self.fail_type_summary = {-1: 0, 0: 0, 1: 0, 2: 0}
        # self.sample_map = False
        self.episode_reward = 0
        self.total_reward_per_log_interval = 0
        self.total_steps_per_log_interval = 0
        self.total_subs_per_log_interval = 0
        self.log_interval = log_interval
        self.reseted = False
        self.train_counter = 0

        # Env properties
        self.map = None

        # Penalties and Rewards
        self.penalty_sub_wrong_tile = -5
        self.penalty_exc_btp_tiles = -10
        self.penalty_bad_map_design = -50
        self.penalty_generation_fail = -50
        self.penalty_exc_subs = -10

        self.reward_neighbor_valid_tiles = 2
        self.reward_place_btp_tiles = 5
        self.reward_basic_playable = 40

        if alg_version == 1:
            # too hard or unsolvable
            self.penalty_agent_ub_thou = -30
            self.reward_agent_ub_solvable = 50
        elif alg_version == 2:
            self.penalty_agent_lb_solvable = -30
            self.penalty_agent_ub_thou = -30
            self.reward_agent_ub_solvable = 10
            self.reward_agent_lb_thou = 50

        # Generation Track
        self.placed_player = 0
        self.placed_boxes = 0
        self.placed_target = 0
        self.env_steps = 0

        # Env Settings
        self.viewer = None
        self.max_steps = dim_room[0] * dim_room[1]
        self.action_space = MultiDiscrete([dim_room[0], dim_room[1], 5])

        if train_mode == 'cnn':
            self.scale = 6
            screen_height, screen_width = (dim_room[0] * self.scale,
                                           dim_room[1] * self.scale)
            self.observation_space = Box(low=0,
                                         high=255,
                                         shape=(screen_height, screen_width,
                                                3),
                                         dtype=np.uint8)
        else:
            self.observation_space = Box(low=0,
                                         high=6,
                                         shape=(dim_room[0], dim_room[1]),
                                         dtype=np.uint8)

        if reset:
            # Initialize Room
            _ = self.reset()
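A minimal instantiation sketch, using only the constructor arguments shown above; with `alg_version=0` no pretrained agents are loaded, so the `agent_*_path` arguments can stay None.

# Generate basic playable maps only (alg_version=0), CNN observations.
gen_env = ALGEnv(dim_room=(10, 10),
                 num_boxes=3,
                 alg_version=0,
                 train_mode='cnn',
                 log_interval=1000)
obs = gen_env.reset()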
Example No. 15
import gym
import time

from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback

# Create a DummyVecEnv for main airsim gym env
env = DummyVecEnv([
    lambda: Monitor(
        gym.make(
            "airgym:airsim-car-sample-v0",
            ip_address="127.0.0.1",
            image_shape=(84, 84, 1),
        ))
])

# Wrap env as VecTransposeImage to allow SB to handle frame observations
env = VecTransposeImage(env)

# Initialize RL algorithm type and parameters
model = DQN(
    "CnnPolicy",
    env,
    learning_rate=0.00025,
    verbose=1,
    batch_size=32,
    train_freq=4,
    target_update_interval=10000,
Example No. 16
        if args.realign_method != "":
            default_wrapper.append(wrapper.ReAlignedWrapper)
    elif args.topology_wrapper == "diff":
        default_wrapper.append(wrapper_diff.get_wrapper_class())
    elif args.topology_wrapper == "MutantWrapper":
        default_wrapper.append(wrapper_mut.MutantWrapper)
    elif args.topology_wrapper == "CustomAlignWrapper":
        default_wrapper.append(wrapper_custom_align.CustomAlignWrapper)
    else:
        pass  # no need for wrapper

    for rank_idx, test_body in enumerate(args.test_bodies):
        eval_venv = DummyVecEnv([
            gym_interface.make_env(rank=rank_idx,
                                   seed=common.seed,
                                   wrappers=default_wrapper,
                                   force_render=args.render,
                                   robot_body=test_body,
                                   dataset_folder="../input_data/bodies")
        ])
        if args.vec_normalize:
            raise NotImplementedError
            # normalize_kwargs["gamma"] = hyperparams["gamma"]
            # eval_venv = VecNormalize(eval_venv, **normalize_kwargs)

        if args.stack_frames > 1:
            eval_venv = VecFrameStack(eval_venv, args.stack_frames)

        eval_venv.seed(common.seed)
        model = PPO.load(args.model_filename)

        obs = eval_venv.reset()
Example No. 17
def run_ensemble_strategy(df, unique_trade_date, rebalance_window, validation_window) -> None:
    """Ensemble Strategy that combines PPO, A2C and DDPG"""
    print("============Start Ensemble Strategy============")
    # for ensemble model, it's necessary to feed the last state
    # of the previous model to the current model as the initial state
    last_state_ensemble = []

    ppo_sharpe_list = []
    ddpg_sharpe_list = []
    a2c_sharpe_list = []

    model_use = []

    # based on the analysis of the in-sample data
    #turbulence_threshold = 140
    insample_turbulence = df[(df.datadate<20151000) & (df.datadate>=20090000)]
    insample_turbulence = insample_turbulence.drop_duplicates(subset=['datadate'])
    insample_turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, .90)

    start = time.time()
    for i in range(rebalance_window + validation_window, len(unique_trade_date), rebalance_window):
        print("============================================")
        ## initial state is empty
        if i - rebalance_window - validation_window == 0:
            # initial state
            initial = True
        else:
            # previous state
            initial = False

        # Tuning turbulence index based on historical data
        # Turbulence lookback window is one quarter
        end_date_index = df.index[df["datadate"] == unique_trade_date[i - rebalance_window - validation_window]].to_list()[-1]
        start_date_index = end_date_index - validation_window*30 + 1

        historical_turbulence = df.iloc[start_date_index:(end_date_index + 1), :]
        #historical_turbulence = df[(df.datadate<unique_trade_date[i - rebalance_window - validation_window]) & (df.datadate>=(unique_trade_date[i - rebalance_window - validation_window - 63]))]


        historical_turbulence = historical_turbulence.drop_duplicates(subset=['datadate'])

        historical_turbulence_mean = np.mean(historical_turbulence.turbulence.values)

        if historical_turbulence_mean > insample_turbulence_threshold:
            # if the mean of the historical data is greater than the 90% quantile of insample turbulence data
            # then we assume that the current market is volatile,
            # therefore we set the 90% quantile of insample turbulence data as the turbulence threshold
            # meaning the current turbulence can't exceed the 90% quantile of insample turbulence data
            turbulence_threshold = insample_turbulence_threshold
        else:
            # if the mean of the historical data is less than the 90% quantile of insample turbulence data
            # then we tune up the turbulence_threshold, meaning we lower the risk
            turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, 1)
        print("turbulence_threshold: ", turbulence_threshold)

        ############## Environment Setup starts ##############
        ## training env
        train = data_split(df, start=20090000, end=unique_trade_date[i - rebalance_window - validation_window])
        env_train = DummyVecEnv([lambda: StockEnvTrain(train)])

        ## validation env
        validation = data_split(df, start=unique_trade_date[i - rebalance_window - validation_window],
                                end=unique_trade_date[i - rebalance_window])
        env_val = DummyVecEnv([lambda: StockEnvValidation(validation,
                                                          turbulence_threshold=turbulence_threshold,
                                                          iteration=i)])
        obs_val = env_val.reset()
        ############## Environment Setup ends ##############

        ############## Training and Validation starts ##############
        print("======Model training from: ", 20090000, "to ",
              unique_trade_date[i - rebalance_window - validation_window])
        # print("training: ",len(data_split(df, start=20090000, end=test.datadate.unique()[i-rebalance_window]) ))
        # print("==============Model Training===========")
        print("======A2C Training========")
        model_a2c = train_A2C(env_train, model_name="A2C_30k_dow_{}".format(i), timesteps=30000)
        print("======A2C Validation from: ", unique_trade_date[i - rebalance_window - validation_window], "to ",
              unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_a2c, test_data=validation, test_env=env_val, test_obs=obs_val)
        sharpe_a2c = get_validation_sharpe(i)
        print("A2C Sharpe Ratio: ", sharpe_a2c)

        print("======PPO Training========")
        model_ppo = train_PPO(env_train, model_name="PPO_100k_dow_{}".format(i), timesteps=100000)
        print("======PPO Validation from: ", unique_trade_date[i - rebalance_window - validation_window], "to ",
              unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_ppo, test_data=validation, test_env=env_val, test_obs=obs_val)
        sharpe_ppo = get_validation_sharpe(i)
        print("PPO Sharpe Ratio: ", sharpe_ppo)

        print("======DDPG Training========")
        model_ddpg = train_DDPG(env_train, model_name="DDPG_10k_dow_{}".format(i), timesteps=10000)
        #model_ddpg = train_TD3(env_train, model_name="DDPG_10k_dow_{}".format(i), timesteps=20000)
        print("======DDPG Validation from: ", unique_trade_date[i - rebalance_window - validation_window], "to ",
              unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_ddpg, test_data=validation, test_env=env_val, test_obs=obs_val)
        sharpe_ddpg = get_validation_sharpe(i)

        ppo_sharpe_list.append(sharpe_ppo)
        a2c_sharpe_list.append(sharpe_a2c)
        ddpg_sharpe_list.append(sharpe_ddpg)

        # Model Selection based on sharpe ratio
        if (sharpe_ppo >= sharpe_a2c) & (sharpe_ppo >= sharpe_ddpg):
            model_ensemble = model_ppo
            model_use.append('PPO')
        elif (sharpe_a2c > sharpe_ppo) & (sharpe_a2c > sharpe_ddpg):
            model_ensemble = model_a2c
            model_use.append('A2C')
        else:
            model_ensemble = model_ddpg
            model_use.append('DDPG')
        ############## Training and Validation ends ##############

        ############## Trading starts ##############
        print("======Trading from: ", unique_trade_date[i - rebalance_window], "to ", unique_trade_date[i])
        #print("Used Model: ", model_ensemble)
        last_state_ensemble = DRL_prediction(df=df, model=model_ensemble, name="ensemble",
                                             last_state=last_state_ensemble, iter_num=i,
                                             unique_trade_date=unique_trade_date,
                                             rebalance_window=rebalance_window,
                                             turbulence_threshold=turbulence_threshold,
                                             initial=initial)
        # print("============Trading Done============")
        ############## Trading ends ##############

    end = time.time()
    print("Ensemble Strategy took: ", (end - start) / 60, " minutes")
Example No. 18
        },
    })
    env.reset()
    return env


def test_env():
    env = train_env()
    env.configure({"policy_frequency": 15, "duration": 20 * 15})
    env.reset()
    return env


if __name__ == '__main__':
    # Train
    model = DQN('CnnPolicy', DummyVecEnv([train_env]),
                learning_rate=5e-4,
                buffer_size=15000,
                learning_starts=200,
                batch_size=32,
                gamma=0.8,
                train_freq=1,
                gradient_steps=1,
                target_update_interval=50,
                exploration_fraction=0.7,
                verbose=1,
                tensorboard_log="highway_cnn/")
    model.learn(total_timesteps=int(1e5))
    model.save("highway_cnn/model")

    # Record video
Example No. 19
    # if args.cnspns:
    # hard code for now. could be automatically determined.
    # _w = wrapper_pns.make_same_dim_wrapper(obs_dim=28, action_dim=8)
    # default_wrapper.append(_w)

    assert len(args.train_bodies) > 0, "No body to train."
    if args.with_bodyinfo:
        default_wrapper.append(wrapper.BodyinfoWrapper)

    print("Making train environments...")
    venv = DummyVecEnv([
        gym_interface.make_env(
            rank=i,
            seed=common.seed,
            wrappers=default_wrapper,
            render=args.render,
            robot_body=args.train_bodies[i % len(args.train_bodies)],
            dataset_folder=args.body_folder) for i in range(args.num_venvs)
    ])

    normalize_kwargs = {}
    if args.vec_normalize:
        normalize_kwargs["gamma"] = hyperparams["gamma"]
        if len(args.model_filename) > 0:
            venv = VecNormalize.load(
                common.get_vec_pkl_from_model_filename(args.model_filename),
                venv)
        else:
            venv = VecNormalize(venv, **normalize_kwargs)
Example No. 20
 def setup_test(self):
     env_fun = my_utils.import_env(env_config["env_name"])
     self.env = DummyVecEnv([lambda: env_fun(config)])
     self.policy = my_utils.make_par_policy(self.env, config)
     self.policy.load_state_dict(T.load(config["test_agent_path"]))
Example No. 21
def make_vec_env(
    env_name: str,
    n_envs: int = 8,
    seed: int = 0,
    parallel: bool = False,
    log_dir: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
    post_wrappers: Optional[Sequence[Callable[[gym.Env, int], gym.Env]]] = None,
) -> VecEnv:
    """Returns a VecEnv initialized with `n_envs` Envs.

    Args:
        env_name: The Env's string id in Gym.
        n_envs: The number of duplicate environments.
        seed: The environment seed.
        parallel: If True, uses SubprocVecEnv; otherwise, DummyVecEnv.
        log_dir: If specified, saves Monitor output to this directory.
        max_episode_steps: If specified, wraps each env in a TimeLimit wrapper
            with this episode length. If not specified and `max_episode_steps`
            exists for this `env_name` in the Gym registry, uses the registry
            `max_episode_steps` for every TimeLimit wrapper (this automatic
            wrapper is the default behavior when calling `gym.make`). Otherwise
            the environments are passed into the VecEnv unwrapped.
        post_wrappers: If specified, iteratively wraps each environment with each
            of the wrappers specified in the sequence. The argument should be a Callable
            accepting two arguments, the Env to be wrapped and the environment index,
            and returning the wrapped Env.
    """
    # Resolve the spec outside of the subprocess first, so that it is available to
    # subprocesses running `make_env` via automatic pickling.
    spec = gym.spec(env_name)  # still needed below for the registry max_episode_steps fallback

    def make_env(i, this_seed):
        # Previously, we directly called `gym.make(env_name)`, but running
        # `imitation.scripts.train_adversarial` within `imitation.scripts.parallel`
        # created a weird interaction between Gym and Ray -- `gym.make` would fail
        # inside this function for any of our custom environment unless those
        # environments were also `gym.register()`ed inside `make_env`. Even
        # registering the custom environment in the scope of `make_vec_env` didn't
        # work. For more discussion and hypotheses on this issue see PR #160:
        # https://github.com/HumanCompatibleAI/imitation/pull/160.

        #env = spec.make()
        target_machine_ip = '127.0.0.1'
        # for a simulated robot environment
        env = gym.make(env_name, ip=target_machine_ip, gui=False)
        env = ExceptionHandling(env)


        # Seed each environment with a different, non-sequential seed for diversity
        # (even if caller is passing us sequentially-assigned base seeds). int() is
        # necessary to work around gym bug where it chokes on numpy int64s.
        env.seed(int(this_seed))

        if max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps)
        elif spec.max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps=spec.max_episode_steps)

        # Use Monitor to record statistics needed for Baselines algorithms logging
        # Optionally, save to disk
        log_path = None
        if log_dir is not None:
            log_subdir = os.path.join(log_dir, "monitor")
            os.makedirs(log_subdir, exist_ok=True)
            log_path = os.path.join(log_subdir, f"mon{i:03d}")

        env = monitor.Monitor(env, log_path)
        env = wrappers.RolloutInfoWrapper(env)

        if post_wrappers:
            for wrapper in post_wrappers:
                env = wrapper(env, i)

        return env

    rng = np.random.RandomState(seed)
    env_seeds = rng.randint(0, (1 << 31) - 1, (n_envs,))
    env_fns = [functools.partial(make_env, i, s) for i, s in enumerate(env_seeds)]
    if parallel:
        # See GH hill-a/stable-baselines issue #217
        return SubprocVecEnv(env_fns, start_method="forkserver")
    else:
        return DummyVecEnv(env_fns)
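A usage sketch under the assumption that "my_sim-v0" is a registered environment id reachable at the hard-coded IP above; the post-wrapper simply re-applies a TimeLimit (already imported in this module) as an example of the documented `(env, index)` signature.

# Eight sequential environments with per-env Monitor logs under logs/monitor.
venv = make_vec_env(
    "my_sim-v0",              # hypothetical registered env id
    n_envs=8,
    seed=0,
    parallel=False,
    log_dir="logs",
    post_wrappers=[lambda env, i: TimeLimit(env, max_episode_steps=500)],
)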
Example No. 22
def test_save_load_policy(tmp_path, model_class, policy_str):
    """
    Test saving and loading policy only.

    :param model_class: (BaseAlgorithm) A RL model
    :param policy_str: (str) Name of the policy.
    """
    kwargs = {}
    if policy_str == "MlpPolicy":
        env = select_env(model_class)
    else:
        if model_class in [SAC, TD3, DQN]:
            # Avoid memory error when using replay buffer
            # Reduce the size of the features
            kwargs = dict(buffer_size=250)
        env = FakeImageEnv(screen_height=40,
                           screen_width=40,
                           n_channels=2,
                           discrete=model_class == DQN)

    env = DummyVecEnv([lambda: env])

    # create model
    model = model_class(policy_str,
                        env,
                        policy_kwargs=dict(net_arch=[16]),
                        verbose=1,
                        **kwargs)
    model.learn(total_timesteps=500, eval_freq=250)

    env.reset()
    observations = np.concatenate(
        [env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    policy = model.policy
    policy_class = policy.__class__
    actor, actor_class = None, None
    if model_class in [SAC, TD3]:
        actor = policy.actor
        actor_class = actor.__class__

    # Get dictionary of current parameters
    params = deepcopy(policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param))
                         for param_name, param in params.items())

    # Update model parameters with the new random values
    policy.load_state_dict(random_params)

    new_params = policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(
            params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = policy.predict(observations, deterministic=True)
    # Should also work with the actor only
    if actor is not None:
        selected_actions_actor, _ = actor.predict(observations,
                                                  deterministic=True)

    # Save and load policy
    policy.save(tmp_path / "policy.pkl")
    # Save and load actor
    if actor is not None:
        actor.save(tmp_path / "actor.pkl")

    del policy, actor

    policy = policy_class.load(tmp_path / "policy.pkl")
    if actor_class is not None:
        actor = actor_class.load(tmp_path / "actor.pkl")

    # check if params are still the same after load
    new_params = policy.state_dict()

    # Check that all params are the same as before save load procedure now
    for key in params:
        assert th.allclose(
            params[key], new_params[key]
        ), "Policy parameters not the same after save and load."

    # check if model still selects the same actions
    new_selected_actions, _ = policy.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    if actor_class is not None:
        new_selected_actions_actor, _ = actor.predict(observations,
                                                      deterministic=True)
        assert np.allclose(selected_actions_actor, new_selected_actions_actor,
                           1e-4)
        assert np.allclose(selected_actions_actor, new_selected_actions, 1e-4)

    # clear file from os
    os.remove(tmp_path / "policy.pkl")
    if actor_class is not None:
        os.remove(tmp_path / "actor.pkl")
Example No. 23
 def get_sb_env(self):
     e = DummyVecEnv([lambda: self])
     obs = e.reset()
     return e, obs
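A hedged sketch of how this helper is typically called from a custom `gym.Env` subclass; the `StockTradingEnv` name, `train_df` variable, and the choice of PPO are illustrative assumptions.

from stable_baselines3 import PPO

# Wrap the single custom environment so SB3 algorithms, which expect a
# VecEnv, can train on it directly.
env_train = StockTradingEnv(df=train_df)     # hypothetical custom env
vec_env, initial_obs = env_train.get_sb_env()

model = PPO("MlpPolicy", vec_env, verbose=0)
model.learn(total_timesteps=10000)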
Example No. 24
def test_save_load(tmp_path, model_class):
    """
    Test if 'save' and 'load' saves and loads model correctly
    and if 'load_parameters' and 'get_policy_parameters' work correctly

    Warning: this does not test loading of optimizer parameters.

    :param model_class: (BaseAlgorithm) A RL model
    """

    env = DummyVecEnv([lambda: select_env(model_class)])

    # create model
    model = model_class("MlpPolicy",
                        env,
                        policy_kwargs=dict(net_arch=[16]),
                        verbose=1)
    model.learn(total_timesteps=500, eval_freq=250)

    env.reset()
    observations = np.concatenate(
        [env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    # Get dictionary of current parameters
    params = deepcopy(model.policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param))
                         for param_name, param in params.items())

    # Update model parameters with the new random values
    model.policy.load_state_dict(random_params)

    new_params = model.policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(
            params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Save, delete and reload the model
    model.save(tmp_path / "test_save.zip")
    del model
    model = model_class.load(str(tmp_path / "test_save.zip"), env=env)

    # check if params are still the same after load
    new_params = model.policy.state_dict()

    # Check that all params are the same as before save load procedure now
    for key in params:
        assert th.allclose(
            params[key], new_params[key]
        ), "Model parameters not the same after save and load."

    # check if model still selects the same actions
    new_selected_actions, _ = model.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # check if learn still works
    model.learn(total_timesteps=1000, eval_freq=500)

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
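The docstring above also mentions parameter loading, which the test body never exercises; here is a minimal sketch of the in-memory round trip using stable-baselines3's get_parameters/set_parameters, assuming `model` is any SB3 algorithm instance and an SB3 version that provides both methods:

# Sketch: round-trip the model parameters without touching the disk.
params = model.get_parameters()                 # dict of state_dicts (policy, optimizers, ...)
model.set_parameters(params, exact_match=True)  # restore them onto the same model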
Example No. 25
        env = wrap_pytorch(
            wrap_deepmind(
                env,
                clip_rewards=True,
                frame_stack=True,
                scale=False,
            ))
        env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env

    return thunk


envs = DummyVecEnv(
    [make_env(args.gym_id, args.seed + i, i) for i in range(args.num_envs)])
# if args.prod_mode:
#     envs = VecPyTorch(
#         SubprocVecEnv([make_env(args.gym_id, args.seed+i, i) for i in range(args.num_envs)], "fork"),
#         device
#     )
assert isinstance(envs.action_space,
                  Discrete), "only discrete action space is supported"


# ALGO LOGIC: initialize agent here:
class Scale(nn.Module):
    """Multiply the input by a constant factor (e.g. 1/255 for pixel inputs)."""
    def __init__(self, scale):
        super().__init__()
        self.scale = scale

    def forward(self, x):
        return x * self.scale
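The snippet cuts off inside the agent definition; as a hedged sketch (the layer sizes and the use of nn.LazyLinear are assumptions, not taken from this script), a module like Scale is typically placed at the front of the Q-network so raw uint8 pixels are rescaled inside the model:

# Illustrative only: rescale pixel observations inside the network itself.
import torch.nn as nn

q_network = nn.Sequential(
    Scale(1 / 255.0),                      # map uint8 pixels from [0, 255] to [0, 1]
    nn.Flatten(),                          # flatten stacked frames into a vector
    nn.LazyLinear(512),                    # hidden layer; input size inferred on first call
    nn.ReLU(),
    nn.Linear(512, envs.action_space.n),   # one Q-value per discrete action
)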
Example No. 26
def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary
    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs, eval_env_kwargs
    global normalize

    if eval_env:
        kwargs = eval_env_kwargs
    else:
        kwargs = env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else save_path

    if n_envs == 1:
        # use rank=127 so eval_env won't overlap with any training_env.
        env = DummyVecEnv([
            make_env(env_id,
                     127,
                     args.seed,
                     wrapper_class=env_wrapper,
                     log_dir=log_dir,
                     env_kwargs=kwargs[0])
        ])
    else:
        # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
        # On most env, SubprocVecEnv does not help and is quite memory hungry
        env = DummyVecEnv([
            make_env(env_id,
                     i,
                     args.seed,
                     log_dir=log_dir,
                     env_kwargs=kwargs[i],
                     wrapper_class=env_wrapper) for i in range(n_envs)
        ])

    if normalize:
        # Copy to avoid changing default values by reference
        local_normalize_kwargs = normalize_kwargs.copy()
        # Do not normalize reward for env used for evaluation
        if eval_env:
            if len(local_normalize_kwargs) > 0:
                local_normalize_kwargs["norm_reward"] = False
            else:
                local_normalize_kwargs = {"norm_reward": False}

        if args.verbose > 0:
            if len(local_normalize_kwargs) > 0:
                print(f"Normalization activated: {local_normalize_kwargs}")
            else:
                print("Normalizing input and reward")
        env = VecNormalize(env, **local_normalize_kwargs)

    return env
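A brief usage sketch for create_env; it assumes the module-level globals it reads (hyperparams, env_kwargs, eval_env_kwargs, normalize, args, save_path) have already been populated by the surrounding script:

# Hypothetical call sites: a vectorized training env plus a single, unlogged
# evaluation env whose reward is not normalized.
train_env = create_env(n_envs=8)
eval_env = create_env(n_envs=1, eval_env=True)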
Example No. 27
    args = common.args
    args.model_filename = "output_data/tmp/best_model.zip"
    args.test_bodies = [320]
    args.stack_frames = 4
    args.test_steps = 1000
    args.render = True
    print(args)

    default_wrapper = [wrapper.WalkerWrapper]

    assert len(args.train_bodies) == 0, "No need for body to train."
    if args.with_bodyinfo:
        default_wrapper += [wrapper.BodyinfoWrapper]

    for test_body in args.test_bodies:
        eval_venv = DummyVecEnv([gym_interface.make_env(rank=0, seed=common.seed, wrappers=default_wrapper, render=args.render,
                                                        robot_body=test_body)])
        if args.vec_normalize:
            raise NotImplementedError
            # normalize_kwargs["gamma"] = hyperparams["gamma"]
            # eval_venv = VecNormalize(eval_venv, **normalize_kwargs)

        if args.stack_frames > 1:
            eval_venv = VecFrameStack(eval_venv, args.stack_frames)

        eval_venv.seed(common.seed)
        model = PPO.load(args.model_filename)

        obs = eval_venv.reset()
        g_obs_data = np.zeros(shape=[args.test_steps, obs.shape[1]], dtype=np.float32)

        if True:
Example No. 28
        default_wrapper.append(wrapper_mut.MutantWrapper)
    elif args.topology_wrapper == "CustomAlignWrapper":
        default_wrapper.append(wrapper_custom_align.CustomAlignWrapper)
    else:
        pass  # no need for wrapper

    assert len(args.robo_bodies) > 0, "No body to train."
    if args.with_bodyinfo:
        default_wrapper.append(wrapper.BodyinfoWrapper)

    print("Making train environments...")
    venv = DummyVecEnv([
        gym_interface.make_pyrobotdesign_env(
            rank=i,
            seed=common.seed,
            wrappers=default_wrapper,
            render=args.render,
            dataset_folder=args.dataset_folder,
            robo_body=args.robo_bodies[i % len(args.robo_bodies)])
        for i in range(args.num_venvs)
    ])

    normalize_kwargs = {}
    if args.vec_normalize:
        normalize_kwargs["gamma"] = hyperparams["gamma"]
        venv = VecNormalize(venv, **normalize_kwargs)

    if args.stack_frames > 1:
        venv = VecFrameStack(venv, args.stack_frames)

    keys_remove = ["normalize", "n_envs", "n_timesteps", "policy"]
    for key in keys_remove:
        del hyperparams[key]
Example No. 29
def test_save_load_q_net(tmp_path, model_class, policy_str):
    """
    Test saving and loading q-network/quantile net only.

    :param model_class: (BaseAlgorithm) A RL model
    :param policy_str: (str) Name of the policy.
    """
    kwargs = dict(policy_kwargs=dict(net_arch=[16]))
    if policy_str == "MlpPolicy":
        env = select_env(model_class)
    else:
        if model_class in [QRDQN]:
            # Avoid memory error when using replay buffer
            # Reduce the size of the features
            kwargs = dict(
                buffer_size=250,
                learning_starts=100,
                policy_kwargs=dict(features_extractor_kwargs=dict(
                    features_dim=32)),
            )
        env = FakeImageEnv(screen_height=40,
                           screen_width=40,
                           n_channels=2,
                           discrete=model_class == QRDQN)

    # Reduce number of quantiles for faster tests
    if model_class in [QRDQN]:
        kwargs["policy_kwargs"].update(dict(n_quantiles=20))

    env = DummyVecEnv([lambda: env])

    # create model
    model = model_class(policy_str, env, verbose=1, **kwargs)
    model.learn(total_timesteps=300)

    env.reset()
    observations = np.concatenate(
        [env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    q_net = model.quantile_net
    q_net_class = q_net.__class__

    # Get dictionary of current parameters
    params = deepcopy(q_net.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param))
                         for param_name, param in params.items())

    # Update model parameters with the new random values
    q_net.load_state_dict(random_params)

    new_params = q_net.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(
            params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = q_net.predict(observations, deterministic=True)

    # Save and load q_net
    q_net.save(tmp_path / "q_net.pkl")

    del q_net

    q_net = q_net_class.load(tmp_path / "q_net.pkl")

    # check if params are still the same after load
    new_params = q_net.state_dict()

    # Check that all params are the same as before save load procedure now
    for key in params:
        assert th.allclose(
            params[key], new_params[key]
        ), "Policy parameters not the same after save and load."

    # check if model still selects the same actions
    new_selected_actions, _ = q_net.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # clear file from os
    os.remove(tmp_path / "q_net.pkl")
Example No. 30
    training_bodies = [int(x) for x in args.train_bodies.split(",")]
    str_ids = "-".join(str(x) for x in training_bodies)
    if args.test_bodies == "":
        test_bodies = []
    else:
        test_bodies = [int(x) for x in args.test_bodies.split(",")]
    
    # default_wrapper = wrapper.BodyinfoWrapper
    # if args.disable_wrapper:
    #     default_wrapper = None
    default_wrapper = wrapper.WalkerWrapper
    # default_wrapper = None

    if with_bodyinfo:
        env = DummyVecEnv([
            utils.make_env(template=utils.template(training_bodies[i % len(training_bodies)]),
                           rank=i, seed=utils.seed, wrapper=default_wrapper, render=args.render,
                           robot_body=training_bodies[i % len(training_bodies)],
                           body_info=training_bodies[i % len(training_bodies)] // 100)
            for i in range(train_num_envs)
        ])
        save_filename = f"model-ant-{str_ids}-with-bodyinfo"
    else:
        env = DummyVecEnv([
            utils.make_env(template=utils.template(training_bodies[i % len(training_bodies)]),
                           rank=i, seed=utils.seed, wrapper=default_wrapper, render=args.render,
                           robot_body=training_bodies[i % len(training_bodies)],
                           body_info=0)
            for i in range(train_num_envs)
        ])
        save_filename = f"model-ant-{str_ids}"

    if args.vec_normalize:
        env = VecNormalize(env, **normalize_kwargs)

    if args.stack_frames > 1:
        env = VecFrameStack(env, args.stack_frames)


    keys_remove = ["normalize", "n_envs", "n_timesteps", "policy"]
    for key in keys_remove:
        del hyperparams[key]
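Once the bookkeeping keys are stripped, the remaining hyperparameters are normally passed straight to the algorithm constructor. A minimal sketch of that next step, assuming PPO is the algorithm in use and that the leftover keys match its signature:

# Sketch: build and train the agent with whatever hyperparameters remain.
model = PPO("MlpPolicy", env=env, **hyperparams)
model.learn(total_timesteps=int(1e6))
model.save(save_filename)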