Example #1
    def __init__(
        self,
        observation_space: gym.Space,
        action_space: gym.Space,
        policy: Type[policies.BasePolicy],
        expert_data: Union[Iterable[Mapping], types.TransitionsMinimal,
                           None] = None,
        optimizer_cls: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
        ent_weight: float = 1e-3,
        l2_weight: float = 0.0,
        device: Union[str, th.device] = "auto",
    ):
        """Behavioral cloning (BC).

        Recovers a policy via supervised learning on observation-action Tensor
        pairs, sampled from a Torch DataLoader or any Iterator that ducktypes
        `torch.utils.data.DataLoader`.

        Args:
            observation_space: the observation space of the environment.
            action_space: the action space of the environment.
            policy: the policy to be trained.
            expert_data: If not None, then immediately call
                  `self.set_expert_data_loader(expert_data)` during initialization.
            optimizer_cls: optimiser to use for supervised training.
            optimizer_kwargs: keyword arguments, excluding learning rate and
                  weight decay, for optimiser construction.
            ent_weight: scaling applied to the policy's entropy regularization.
            l2_weight: scaling applied to the policy's L2 regularization.
            device: name/identity of device to place policy on.
        """
        if optimizer_kwargs:
            if "weight_decay" in optimizer_kwargs:
                raise ValueError(
                    "Use the parameter l2_weight instead of weight_decay.")

        self.action_space = action_space
        self.observation_space = observation_space
        self.device = device = utils.get_device(device)

        self.policy = policy.policy.to(self.device)
        # pytype: disable=not-instantiable
        optimizer_kwargs = optimizer_kwargs or {}
        self.optimizer = optimizer_cls(self.policy.parameters(),
                                       **optimizer_kwargs)

        self.expert_data_loader: Optional[Iterable[Mapping]] = None
        self.ent_weight = ent_weight
        self.l2_weight = l2_weight

        if expert_data is not None:
            self.set_expert_data_loader(expert_data)
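
All of these examples route the device argument through get_device; a minimal sketch of that helper, following the behavior documented by stable-baselines3 ("auto" resolves to CUDA when available, otherwise CPU):

from typing import Union

import torch as th


def get_device(device: Union[th.device, str] = "auto") -> th.device:
    # "auto" and "cuda" both prefer the GPU; fall back to CPU when CUDA is unavailable.
    if device == "auto":
        device = "cuda"
    device = th.device(device)
    if device.type == th.device("cuda").type and not th.cuda.is_available():
        return th.device("cpu")
    return device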
Example #2
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 update_masks: Dict[str, Tuple[List[int], int]],
                 device: Union[torch.device, str] = "auto",
                 cached: bool = False,
                 bias: bool = True,
                 **kwargs):

        kwargs.setdefault('aggr', 'add')
        super(NerveNetConv, self).__init__(**kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.update_masks = update_masks
        self.use_bias = bias
        self.cached = cached
        self.device = get_device(device)

        self._cached_edge_index = None
        self._cached_adj_t = None

        self.update_models_parameter = {}
        for group_name, _ in update_masks.items():
            self.update_models_parameter[group_name] = {}
            self.update_models_parameter[group_name]["weights"] = Parameter(
                torch.Tensor(in_channels, out_channels)).to(self.device)

            if self.use_bias:
                self.update_models_parameter[group_name]["bias"] = Parameter(
                    torch.Tensor(out_channels)).to(self.device)
            else:
                self.update_models_parameter[group_name]["bias"] = None

        self.reset_parameters()
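
reset_parameters() is called above but not shown; a minimal sketch of what it might look like, assuming the glorot/zeros initializers from torch_geometric.nn.inits:

from torch_geometric.nn.inits import glorot, zeros

def reset_parameters(self):
    # Re-initialize every per-group weight matrix (and its bias, if one was created).
    for group_params in self.update_models_parameter.values():
        glorot(group_params["weights"])
        zeros(group_params["bias"])  # zeros() safely ignores a None bias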
Example #3
    def load(cls, path: str, device: Union[th.device, str] = "auto") -> "BaseModel":
        """
        Load model from path.

        :param path:
        :param device: Device on which the policy should be loaded.
        :return:
        """
        device = get_device(device)
        saved_variables = th.load(path, map_location=device)

        # Allow to load policy saved with older version of SB3
        if "sde_net_arch" in saved_variables["data"]:
            warnings.warn(
                "sde_net_arch is deprecated, please downgrade to SB3 v1.2.0 if you need such parameter.",
                DeprecationWarning,
            )
            del saved_variables["data"]["sde_net_arch"]

        # Create policy object
        model = cls(**saved_variables["data"])  # pytype: disable=not-instantiable
        # Load weights
        model.load_state_dict(saved_variables["state_dict"])
        model.to(device)
        return model
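
A hedged usage sketch for this classmethod (the path is a placeholder; any BaseModel subclass such as ActorCriticPolicy would work):

from stable_baselines3.common.policies import ActorCriticPolicy

# Hypothetical usage: restore a policy previously stored with .save() and keep it on the CPU.
policy = ActorCriticPolicy.load("logs/policy.pth", device="cpu")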
Example #4
    def __init__(
        self,
        in_features_dim: int,
        hidden_layer_size: int = 32,
        n_lstm_cells: int = 16,
        batch_size: int = 1024,
        orthogonal_init: bool = False,
        activation_fn: Type[nn.Module] = nn.Tanh,
        device: Union[th.device, str] = "cuda",
    ):
        super(LSTM, self).__init__()
        self.device = get_device(device)
        self.hidden_layer_size = hidden_layer_size
        self.n_lstm_cells = n_lstm_cells
        self.batch_size = batch_size

        first_input_size = in_features_dim

        self.lstm = nn.LSTM(first_input_size,
                            hidden_layer_size,
                            num_layers=n_lstm_cells)
        self.linear = nn.Linear(hidden_layer_size, hidden_layer_size)

        self.hidden = self.init_hidden(self.batch_size)
Example #5
    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        device: Union[th.device, str] = "auto",
        features_extractor_class: Type[
            BaseFeaturesExtractor] = FlattenExtractor,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        features_extractor: Optional[nn.Module] = None,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        super(BaseModel, self).__init__()

        if optimizer_kwargs is None:
            optimizer_kwargs = {}

        if features_extractor_kwargs is None:
            features_extractor_kwargs = {}

        self.observation_space = observation_space
        self.action_space = action_space
        self.device = get_device(device)
        self.features_extractor = features_extractor
        self.normalize_images = normalize_images

        self.optimizer_class = optimizer_class
        self.optimizer_kwargs = optimizer_kwargs
        self.optimizer = None  # type: Optional[th.optim.Optimizer]

        self.features_extractor_class = features_extractor_class
        self.features_extractor_kwargs = features_extractor_kwargs
Example #6
    def __init__(
        self,
        feature_dim: int,
        net_arch: List[Union[int, Dict[str, List[int]]]],
        activation_fn: Type[nn.Module],
        device: Union[th.device, str] = "auto",
    ):
        super(MlpExtractor, self).__init__()
        device = get_device(device)
        shared_net, policy_net, value_net = [], [], []
        policy_only_layers = []  # Layer sizes of the network that only belong to the policy network
        value_only_layers = []  # Layer sizes of the network that only belong to the value network
        last_layer_dim_shared = feature_dim

        # Iterate through the shared layers and build the shared parts of the network
        for idx, layer in enumerate(net_arch):
            if isinstance(layer, int):  # Check that this is a shared layer
                layer_size = layer
                # TODO: give layer a meaningful name
                shared_net.append(nn.Linear(last_layer_dim_shared, layer_size))
                shared_net.append(activation_fn())
                last_layer_dim_shared = layer_size
            else:
                if not isinstance(layer, dict):
                    continue
                if "pi" in layer:
                    assert isinstance(layer["pi"], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                    policy_only_layers = layer["pi"]

                if "vf" in layer:
                    assert isinstance(layer["vf"], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                    value_only_layers = layer["vf"]
                break  # From here on the network splits up in policy and value network

        last_layer_dim_pi = last_layer_dim_shared
        last_layer_dim_vf = last_layer_dim_shared

        # Build the non-shared part of the network
        for idx, (pi_layer_size, vf_layer_size) in enumerate(zip_longest(policy_only_layers, value_only_layers)):
            if pi_layer_size is not None:
                assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
                policy_net.append(nn.Linear(last_layer_dim_pi, pi_layer_size))
                policy_net.append(activation_fn())
                last_layer_dim_pi = pi_layer_size

            if vf_layer_size is not None:
                assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
                value_net.append(nn.Linear(last_layer_dim_vf, vf_layer_size))
                value_net.append(activation_fn())
                last_layer_dim_vf = vf_layer_size

        # Save dim, used to create the distributions
        self.latent_dim_pi = last_layer_dim_pi
        self.latent_dim_vf = last_layer_dim_vf

        # Create networks
        # If the list of layers is empty, the network will just act as an Identity module
        self.shared_net = nn.Sequential(*shared_net).to(device)
        self.policy_net = nn.Sequential(*policy_net).to(device)
        self.value_net = nn.Sequential(*value_net).to(device)
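
A short usage sketch of the net_arch format this constructor expects (the feature dimension and layer sizes are arbitrary example values):

import torch.nn as nn

# Hypothetical instantiation: one shared 64-unit layer, then separate pi=[32] and vf=[32] heads.
extractor = MlpExtractor(
    feature_dim=8,
    net_arch=[64, dict(pi=[32], vf=[32])],
    activation_fn=nn.Tanh,
    device="cpu",
)
# extractor.latent_dim_pi == 32 and extractor.latent_dim_vf == 32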
Example #7
def test_predict(model_class, env_id, device):
    if device == "cuda" and not th.cuda.is_available():
        pytest.skip("CUDA not available")

    if env_id == "CartPole-v1":
        if model_class in [SAC, TD3]:
            return
    elif model_class in [DQN]:
        return

    # Test detection of different shapes by the predict method
    model = model_class("MlpPolicy", env_id, device=device)
    # Check that the policy is on the right device
    assert get_device(device).type == model.policy.device.type

    env = gym.make(env_id)
    vec_env = DummyVecEnv([lambda: gym.make(env_id), lambda: gym.make(env_id)])

    obs = env.reset()
    action, _ = model.predict(obs)
    assert action.shape == env.action_space.shape
    assert env.action_space.contains(action)

    vec_env_obs = vec_env.reset()
    action, _ = model.predict(vec_env_obs)
    assert action.shape[0] == vec_env_obs.shape[0]
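
The parametrize decorators that drive this test are not shown above; a plausible sketch of how such a test is typically parametrized (the exact grid of model classes, env ids and devices is an assumption):

import pytest
from stable_baselines3 import A2C, DQN, PPO, SAC, TD3

@pytest.mark.parametrize("model_class", [A2C, DQN, PPO, SAC, TD3])
@pytest.mark.parametrize("env_id", ["CartPole-v1", "Pendulum-v0"])
@pytest.mark.parametrize("device", ["cpu", "cuda", "auto"])
def test_predict(model_class, env_id, device):
    ...  # body as in the example above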
Example #8
    def device(self) -> th.device:
        """Infer which device this policy lives on by inspecting its parameters.
        If it has no parameters, the 'cpu' device is used as a fallback.

        :return:"""
        for param in self.parameters():
            return param.device
        return get_device("cpu")
Example #9
    def device(self) -> torch.device:
        """Infer which device this policy lives on by inspecting its parameters.
        If it has no parameters, the 'auto' device is used as a fallback.

        Note: The BasePolicy class returns 'cpu' rather than 'auto' as its fallback.
        However, when FlattenExtractor is used as the feature-extractor network, no
        parameters are defined from which the correct device could be inferred, so the
        fallback is always taken. With the 'cpu' fallback we would therefore never be
        able to use the GPU together with FlattenExtractor.
        :return:"""
        for param in self.parameters():
            return param.device
        return get_device("auto")
Example #10
    def load(cls, path: str, device: Union[th.device, str] = "auto") -> "BaseModel":
        """
        Load model from path.

        :param path:
        :param device: Device on which the policy should be loaded.
        :return:
        """
        device = get_device(device)
        saved_variables = th.load(path, map_location=device)
        # Create policy object
        model = cls(**saved_variables["data"])  # pytype: disable=not-instantiable
        # Load weights
        model.load_state_dict(saved_variables["state_dict"])
        model.to(device)
        return model
Example #11
def reconstruct_policy(
    policy_path: str,
    device: Union[th.device, str] = "auto",
) -> policies.BasePolicy:
    """Reconstruct a saved policy.

    Args:
        policy_path: path where `.save_policy()` has been run.
        device: device on which to load the policy.

    Returns:
        policy: policy with reloaded weights.
    """
    policy = th.load(policy_path, map_location=utils.get_device(device))
    assert isinstance(policy, policies.BasePolicy)
    return policy
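
A hedged usage sketch (the file path is a placeholder, not taken from the source):

# Hypothetical usage: reload a policy that `.save_policy()` wrote earlier, forcing it onto the CPU.
policy = reconstruct_policy("output/bc_policy.pt", device="cpu")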
Example #12
def reconstruct_trainer(
        scratch_dir: str,
        device: Union[th.device, str] = "auto") -> "DAggerTrainer":
    """Reconstruct trainer from the latest snapshot in some working directory.

    Args:
      scratch_dir: path to the working directory created by a previous run of
        this algorithm. The directory should contain `checkpoint-latest.pt` and
        `policy-latest.pt` files.
      device: device on which to load the trainer.

    Returns:
      trainer: a reconstructed `DAggerTrainer` with the same state as the
        previously-saved one.
    """
    checkpoint_path = os.path.join(scratch_dir, "checkpoint-latest.pt")
    return th.load(checkpoint_path, map_location=utils.get_device(device))
Example #13
    def load(cls,
             path: str,
             device: Union[th.device, str] = 'auto') -> 'BasePolicy':
        """
        Load policy from path.

        :param path: (str)
        :param device: (Union[th.device, str]) Device on which the policy should be loaded.
        :return: (BasePolicy)
        """
        device = get_device(device)
        saved_variables = th.load(path, map_location=device)
        # Create policy object
        model = cls(**saved_variables['data'])
        # Load weights
        model.load_state_dict(saved_variables['state_dict'])
        model.to(device)
        return model
Example #14
    def __init__(
        self,
        policy: Type[BasePolicy],
        env: Union[GymEnv, str, None],
        policy_base: Type[BasePolicy],
        learning_rate: Union[float, Callable],
        policy_kwargs: Dict[str, Any] = None,
        tensorboard_log: Optional[str] = None,
        verbose: int = 0,
        device: Union[th.device, str] = "auto",
        support_multi_env: bool = False,
        create_eval_env: bool = False,
        monitor_wrapper: bool = True,
        seed: Optional[int] = None,
        use_sde: bool = False,
        sde_sample_freq: int = -1,
    ):

        if isinstance(policy, str) and policy_base is not None:
            self.policy_class = get_policy_from_name(policy_base, policy)
        else:
            self.policy_class = policy

        self.device = get_device(device)
        if verbose > 0:
            print(f"Using {self.device} device")

        self.env = None  # type: Optional[GymEnv]
        # get VecNormalize object if needed
        self._vec_normalize_env = unwrap_vec_normalize(env)
        self.verbose = verbose
        self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
        self.observation_space = None  # type: Optional[gym.spaces.Space]
        self.action_space = None  # type: Optional[gym.spaces.Space]
        self.n_envs = None
        self.num_timesteps = 0
        # Used for updating schedules
        self._total_timesteps = 0
        self.eval_env = None
        self.seed = seed
        self.action_noise = None  # type: Optional[ActionNoise]
        self.start_time = None
        self.policy = None
        self.learning_rate = learning_rate
        self.tensorboard_log = tensorboard_log
        self.lr_schedule = None  # type: Optional[Callable]
        self._last_obs = None  # type: Optional[np.ndarray]
        # When using VecNormalize:
        self._last_original_obs = None  # type: Optional[np.ndarray]
        self._episode_num = 0
        # Used for gSDE only
        self.use_sde = use_sde
        self.sde_sample_freq = sde_sample_freq
        # Track the training progress remaining (from 1 to 0)
        # this is used to update the learning rate
        self._current_progress_remaining = 1
        # Buffers for logging
        self.ep_info_buffer = None  # type: Optional[deque]
        self.ep_success_buffer = None  # type: Optional[deque]
        # For logging
        self._n_updates = 0  # type: int

        # Create and wrap the env if needed
        if env is not None:
            if isinstance(env, str):
                if create_eval_env:
                    self.eval_env = maybe_make_env(env, monitor_wrapper,
                                                   self.verbose)

            env = maybe_make_env(env, monitor_wrapper, self.verbose)
            env = self._wrap_env(env)

            self.observation_space = env.observation_space
            self.action_space = env.action_space
            self.n_envs = env.num_envs
            self.env = env

            if not support_multi_env and self.n_envs > 1:
                raise ValueError(
                    "Error: the model does not support multiple envs; it requires "
                    "a single vectorized environment.")

        if self.use_sde and not isinstance(self.action_space,
                                           gym.spaces.Box):
            raise ValueError(
                "generalized State-Dependent Exploration (gSDE) can only be used with continuous actions."
            )
Example #15
    def __init__(self,
                 policy: Type[BasePolicy],
                 env: Union[GymEnv, str],
                 policy_base: Type[BasePolicy],
                 learning_rate: Union[float, Callable],
                 policy_kwargs: Dict[str, Any] = None,
                 verbose: int = 0,
                 device: Union[th.device, str] = 'auto',
                 support_multi_env: bool = False,
                 create_eval_env: bool = False,
                 monitor_wrapper: bool = True,
                 seed: Optional[int] = None,
                 use_sde: bool = False,
                 sde_sample_freq: int = -1,
                 tensorboard_log = None):

        if isinstance(policy, str) and policy_base is not None:
            self.policy_class = get_policy_from_name(policy_base, policy)
        else:
            self.policy_class = policy

        self.device = get_device(device)
        if verbose > 0:
            print(f"Using {self.device} device")

        self.env = None  # type: Optional[GymEnv]
        # get VecNormalize object if needed
        self._vec_normalize_env = unwrap_vec_normalize(env)
        self.verbose = verbose
        self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
        self.observation_space = None  # type: Optional[gym.spaces.Space]
        self.action_space = None  # type: Optional[gym.spaces.Space]
        self.n_envs = None
        self.num_timesteps = 0
        self.eval_env = None
        self.seed = seed
        self.action_noise = None  # type: Optional[ActionNoise]
        self.start_time = None
        self.policy = None
        self.learning_rate = learning_rate
        self.lr_schedule = None  # type: Optional[Callable]
        self._last_obs = None  # type: Optional[np.ndarray]
        # When using VecNormalize:
        self._last_original_obs = None  # type: Optional[np.ndarray]
        self._episode_num = 0
        # Used for SDE only
        self.use_sde = use_sde
        self.sde_sample_freq = sde_sample_freq
        # Track the training progress (from 1 to 0)
        # this is used to update the learning rate
        self._current_progress = 1
        # Buffers for logging
        self.ep_info_buffer = None  # type: Optional[deque]
        self.ep_success_buffer = None  # type: Optional[deque]
        # For logging
        self._n_updates = 0  # type: int
        self.tensorboard_log = tensorboard_log
        # Create and wrap the env if needed
        if env is not None:
            if isinstance(env, str):
                if create_eval_env:
                    eval_env = gym.make(env)
                    if monitor_wrapper:
                        eval_env = Monitor(eval_env, filename=None)
                    self.eval_env = DummyVecEnv([lambda: eval_env])
                if self.verbose >= 1:
                    print("Creating environment from the given name, wrapped in a DummyVecEnv.")

                env = gym.make(env)
                if monitor_wrapper:
                    env = Monitor(env, filename=None)
                env = DummyVecEnv([lambda: env])

            env = self._wrap_env(env)

            self.observation_space = env.observation_space
            self.action_space = env.action_space
            self.n_envs = env.num_envs
            self.env = env

            if not support_multi_env and self.n_envs > 1:
                raise ValueError("Error: the model does not support multiple envs requires a single vectorized"
                                 " environment.")

        # -------------------- logging/tensorboard -------------------- #
        output_formats = [HumanOutputFormat(sys.stdout)]
        if self.tensorboard_log is not None:
            output_formats.append(TensorBoardOutputFormat(self.tensorboard_log))
        self.logger = Logger(folder=None, output_formats=output_formats)
Example #16
    def _load_from_file(load_path: str, load_data: bool = True) -> (Tuple[Optional[Dict[str, Any]],
                                                                          Optional[TensorDict],
                                                                          Optional[TensorDict]]):
        """ Load model data from a .zip archive

        :param load_path: Where to load the model from
        :param load_data: Whether we should load and return data
            (class parameters). Mainly used by 'load_parameters' to only load model parameters (weights)
        :return: (dict),(dict),(dict) Class parameters, model state_dicts (dict of state_dict)
            and dict of extra tensors
        """
        # Check if file exists if load_path is a string
        if isinstance(load_path, str):
            if not os.path.exists(load_path):
                if os.path.exists(load_path + ".zip"):
                    load_path += ".zip"
                else:
                    raise ValueError(f"Error: the file {load_path} could not be found")

        # set device to cpu if cuda is not available
        device = get_device()

        # Open the zip archive and load data
        try:
            with zipfile.ZipFile(load_path, "r") as archive:
                namelist = archive.namelist()
                # If data or parameters is not in the
                # zip archive, assume they were stored
                # as None (_save_to_file_zip allows this).
                data = None
                tensors = None
                params = {}

                if "data" in namelist and load_data:
                    # Load class parameters and convert to string
                    json_data = archive.read("data").decode()
                    data = json_to_data(json_data)

                if "tensors.pth" in namelist and load_data:
                    # Load extra tensors
                    with archive.open('tensors.pth', mode="r") as tensor_file:
                        # File has to be seekable, but opt_param_file is not, so load in BytesIO first
                        # fixed in python >= 3.7
                        file_content = io.BytesIO()
                        file_content.write(tensor_file.read())
                        # go to start of file
                        file_content.seek(0)
                        # load the parameters with the right ``map_location``
                        tensors = th.load(file_content, map_location=device)

                # check for all other .pth files
                other_files = [file_name for file_name in namelist if
                               os.path.splitext(file_name)[1] == ".pth" and file_name != "tensors.pth"]
                # if there are any other files which end with .pth and aren't "params.pth"
                # assume that they each are optimizer parameters
                if len(other_files) > 0:
                    for file_path in other_files:
                        with archive.open(file_path, mode="r") as opt_param_file:
                            # File has to be seekable, but opt_param_file is not, so load in BytesIO first
                            # fixed in python >= 3.7
                            file_content = io.BytesIO()
                            file_content.write(opt_param_file.read())
                            # go to start of file
                            file_content.seek(0)
                            # load the parameters with the right ``map_location``
                            params[os.path.splitext(file_path)[0]] = th.load(file_content, map_location=device)

        except zipfile.BadZipFile:
            # load_path wasn't a zip file
            raise ValueError(f"Error: the file {load_path} wasn't a zip-file")
        return data, params, tensors
Example #17
                        type=str,
                        default="BreakoutNoFrameskip-v4",
                        help='environment ID')
    parser.add_argument('-e',
                        '--epochs',
                        help='Number of epochs to train for',
                        default=5,
                        type=int)
    parser.add_argument('-s',
                        '--seed',
                        help="Random seed",
                        default=0,
                        type=int)
    args = parser.parse_args()

    device = get_device()
    print(f"Using {device} device.")

    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)
    th.backends.cudnn.deterministic = True
    th.backends.cudnn.benchmark = False
    set_random_seed(seed)

    env_id = args.env
    if 'maze' not in env_id.lower():
        raise Exception(f"env {env_id} is not a maze env")
    env = gym.make(env_id)
    print(f"Created env with obs.shape = {env.reset().shape}.")
Example #18
    def __init__(
        self,
        policy: Type[BasePolicy],
        env: Union[GymEnv, str, None],
        policy_base: Type[BasePolicy],
        learning_rate: Union[float, Schedule],
        policy_kwargs: Optional[Dict[str, Any]] = None,
        tensorboard_log: Optional[str] = None,
        verbose: int = 0,
        device: Union[th.device, str] = "auto",
        support_multi_env: bool = False,
        create_eval_env: bool = False,
        monitor_wrapper: bool = True,
        seed: Optional[int] = None,
        use_sde: bool = False,
        sde_sample_freq: int = -1,
        supported_action_spaces: Optional[Tuple[gym.spaces.Space, ...]] = None,
    ):

        if isinstance(policy, str) and policy_base is not None:
            self.policy_class = get_policy_from_name(policy_base, policy)
        else:
            self.policy_class = policy

        self.device = get_device(device)
        if verbose > 0:
            print(f"Using {self.device} device")

        self.env = None  # type: Optional[GymEnv]
        # get VecNormalize object if needed
        self._vec_normalize_env = unwrap_vec_normalize(env)
        self.verbose = verbose
        self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
        self.observation_space = None  # type: Optional[gym.spaces.Space]
        self.action_space = None  # type: Optional[gym.spaces.Space]
        self.n_envs = None
        self.num_timesteps = 0
        # Used for updating schedules
        self._total_timesteps = 0
        # Used for computing fps, it is updated at each call of learn()
        self._num_timesteps_at_start = 0
        self.eval_env = None
        self.seed = seed
        self.action_noise = None  # type: Optional[ActionNoise]
        self.start_time = None
        self.policy = None
        self.learning_rate = learning_rate
        self.tensorboard_log = tensorboard_log
        self.lr_schedule = None  # type: Optional[Schedule]
        self._last_obs = None  # type: Optional[Union[np.ndarray, Dict[str, np.ndarray]]]
        self._last_episode_starts = None  # type: Optional[np.ndarray]
        # When using VecNormalize:
        self._last_original_obs = None  # type: Optional[Union[np.ndarray, Dict[str, np.ndarray]]]
        self._episode_num = 0
        # Used for gSDE only
        self.use_sde = use_sde
        self.sde_sample_freq = sde_sample_freq
        # Track the training progress remaining (from 1 to 0)
        # this is used to update the learning rate
        self._current_progress_remaining = 1
        # Buffers for logging
        self.ep_info_buffer = None  # type: Optional[deque]
        self.ep_success_buffer = None  # type: Optional[deque]
        # For logging (and TD3 delayed updates)
        self._n_updates = 0  # type: int
        # The logger object
        self._logger = None  # type: Logger
        # Whether the user passed a custom logger or not
        self._custom_logger = False

        # Create and wrap the env if needed
        if env is not None:
            if isinstance(env, str):
                if create_eval_env:
                    self.eval_env = maybe_make_env(env, self.verbose)

            env = maybe_make_env(env, self.verbose)
            env = self._wrap_env(env, self.verbose, monitor_wrapper)

            self.observation_space = env.observation_space
            self.action_space = env.action_space
            self.n_envs = env.num_envs
            self.env = env

            if supported_action_spaces is not None:
                assert isinstance(
                    self.action_space, supported_action_spaces
                ), (f"The algorithm only supports {supported_action_spaces} as action spaces "
                    f"but {self.action_space} was provided")

            if not support_multi_env and self.n_envs > 1:
                raise ValueError(
                    "Error: the model does not support multiple envs; it requires "
                    "a single vectorized environment.")

            # Catch common mistake: using MlpPolicy/CnnPolicy instead of MultiInputPolicy
            if policy in ["MlpPolicy", "CnnPolicy"] and isinstance(
                    self.observation_space, gym.spaces.Dict):
                raise ValueError(
                    f"You must use `MultiInputPolicy` when working with dict observation space, not {policy}"
                )

            if self.use_sde and not isinstance(self.action_space,
                                               gym.spaces.Box):
                raise ValueError(
                    "generalized State-Dependent Exploration (gSDE) can only be used with continuous actions."
                )
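
From the user's side, the device argument of this base constructor is simply what the algorithm classes forward; a minimal usage sketch:

from stable_baselines3 import PPO

# "auto" resolves to CUDA when available, otherwise CPU; "cpu"/"cuda" force a device.
model = PPO("MlpPolicy", "CartPole-v1", device="auto", verbose=1)
print(model.device, model.policy.device)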
Example #19
    def __init__(self,
                 net_arch: Dict[str, List[Tuple[nn.Module, int]]],
                 activation_fn: Type[nn.Module],
                 gnn_for_values=False,
                 use_sibling_relations: bool = False,
                 drop_body_nodes: bool = True,
                 embedding_option=EmbeddingOption.SHARED,
                 device: Union[torch.device, str] = "auto",
                 task_name: str = None,
                 xml_name: str = None,
                 xml_assets_path: Path = None):
        '''
        TODO add documentation
        Parameters:
            net_arch:
                Specifies the network architecture. The network consists of four parts:
                First we have two parts that make up the shared network. This takes as
                input the observations mapped to the node embedding space.
                The mapping is done based on the group a node belongs to (hips, feet, ankles, etc.).
                Because this mapping results in differently sized node features (e.g. ankles
                may have more node features than feet) the first part of the shared network
                is called the input model, which produces a fixed-size node embedding vector
                for all nodes regardless of their group.
                The second part of the shared network is a GNN which is called the propagation model.
                It takes the fixed-size embedding vectors and the adjacency matrix and outputs the new
                node embeddings.
                Afterwards we have two separate networks, the value model and the policy model.
                Both take the new node embeddings and output a latent representation for the policy mean
                or the value scalar.
                The network architecture is provided as a dictionary of lists with four keys
                corresponding to the four parts of the network as described above.
                Each list is a list of tuples of type (nn.Module, int) where the first element
                is the layer class that should be used and the second element is the output
                size of this layer.
                For example:
                net_arch = {
                    "input": [
                        (nn.Linear, 8)
                    ],
                    "propagate": [
                        (GCNConv, 12),
                        (nn.Linear, 16),
                        (GCNConv, 12)
                    ],
                    "policy": [
                        (nn.Linear, 16)
                    ],
                    "value": [
                        (nn.Linear, 16)
                    ]
                }
        '''
        super(NerveNetGNN_V0, self).__init__()

        self.task_name = task_name
        self.xml_name = xml_name
        self.xml_assets_path = xml_assets_path
        self.device = get_device(device)
        self.gnn_for_values = gnn_for_values

        self.info = parse_mujoco_graph(task_name=self.task_name,
                                       xml_name=self.xml_name,
                                       xml_assets_path=self.xml_assets_path,
                                       embedding_option=embedding_option)
        self.info["static_input_mapping"] = {}
        # Notes on edge attributes:
        # using one hot encoding leads to num_edge_features != 1
        # officially this is supported for graph data types.
        # However, depending on the type of GNN used, the edge attributes are
        # interpreted not as attributes but as weights.
        # Hence, they can't be of arbitrary shape (must be [num_edges, 1]) and
        # should be somewhat meaningfully be interpretable as weight factors.
        # This is not the case for attributes produced by the following function!
        # Ergo, we should not use them!
        self.edge_index, self.edge_attr = relation_matrix_to_adjacency_matrix(
            self.info["relation_matrix"],
            self_loop=True
        )
        self.edge_index = self.edge_index.to(self.device)
        self.edge_attr = self.edge_attr.to(self.device)
        self.static_node_attr, self.static_node_attr_mask = get_static_node_attributes(
            self.info["static_input_mapping"],
            self.info["num_nodes"])
        self.update_masks, self.observation_mask = get_update_masks(self.info["obs_input_mapping"],
                                                                    self.static_node_attr_mask,
                                                                    self.static_node_attr.shape,
                                                                    self.info["input_type_dict"])

        self.shared_input_nets = {}
        shared_net, policy_net, value_net = [], [], []
        # Layer sizes of the network that only belong to the policy network
        policy_only_layers = []
        # Layer sizes of the network that only belong to the value network
        value_only_layers = []

        assert "input" in net_arch, "An input model must be specified in the net_arch attribute"
        assert "propagate" in net_arch, "A propagation model must be specified in the net_arch attribute"
        assert "policy" in net_arch, "A policy model must be specified in the net_arch attribute"
        assert "value" in net_arch, "A value model must be specified in the net_arch attribute"

        # from here on we build the network
        # first we build the input model, where each group of nodes gets
        # its own instance of the input model
        for group_name, (_, attribute_mask) in self.update_masks.items():
            shared_input_layers = []
            last_layer_dim_input = len(attribute_mask)
            if last_layer_dim_input > 0:
                for layer_class, layer_size in net_arch["input"]:
                    shared_input_layers.append(layer_class(
                        last_layer_dim_input, layer_size))
                    shared_input_layers.append(activation_fn())
                    last_layer_dim_input = layer_size
            else:
                shared_input_layers.append(nn.Identity())
                last_layer_dim_input = net_arch["input"][-1][1]

            self.shared_input_nets[group_name] = nn.Sequential(
                *shared_input_layers).to(self.device)

        # max_static_feature_dim = self.static_node_attr.shape[1]
        # max_obs_feature_dim = max(
        #     [len(l) for l in self.info["obs_input_mapping"].values()])
        # last_layer_dim_input = max_obs_feature_dim + max_static_feature_dim

        self.last_layer_dim_input = last_layer_dim_input
        last_layer_dim_shared = last_layer_dim_input

        # Iterate through the shared layers and build the shared parts of the network
        # only the shared network may have GCN convolutions
        for layer_class, layer_size in net_arch["propagate"]:
            # TODO: give layer a meaningful name
            if layer_class == GCNConv:
                # for GCN Conv we need an additional parameter for the constructor
                shared_net.append(layer_class(last_layer_dim_shared,
                                              layer_size,
                                              # we already added self_loops ourselves
                                              add_self_loops=False).to(self.device))
            elif layer_class == NerveNetConv:
                shared_net.append(layer_class(last_layer_dim_shared,
                                              layer_size,
                                              self.update_masks, device=device).to(self.device))
            else:
                shared_net.append(layer_class(last_layer_dim_shared,
                                              layer_size).to(self.device))
            shared_net.append(activation_fn())
            last_layer_dim_shared = layer_size

        # Build the non-shared part of the network

        # in the shared network we use GCN convolutions,
        # which means last_layer_dim_shared is the number of
        # dimensions we have for every single node
        last_layer_dim_pi = self.info["num_nodes"] * last_layer_dim_shared
        if self.gnn_for_values:
            last_layer_dim_vf = self.info["num_nodes"] * last_layer_dim_shared
        else:
            last_layer_dim_vf = self.info["num_nodes"] * \
                self.last_layer_dim_input

        for layer_class, layer_size in net_arch["policy"]:
            policy_net.append(layer_class(
                last_layer_dim_pi, layer_size).to(self.device))
            policy_net.append(activation_fn().to(self.device))
            last_layer_dim_pi = layer_size
        policy_net.append(nn.Linear(last_layer_dim_pi, len(
            self.info["output_list"])).to(self.device))

        for layer_class, layer_size in net_arch["value"]:
            value_net.append(layer_class(
                last_layer_dim_vf, layer_size).to(self.device))
            value_net.append(activation_fn().to(self.device))
            last_layer_dim_vf = layer_size
        value_net.append(nn.Linear(last_layer_dim_vf, 1).to(self.device))

        # Save dim, used to create the distributions
        self.latent_dim_pi = last_layer_dim_pi
        self.latent_dim_vf = last_layer_dim_vf

        # Create networks
        # If the list of layers is empty, the network will just act as an Identity module
        self.shared_net = shared_net
        self.flatten = nn.Flatten()
        self.policy_net = nn.Sequential(*policy_net).to(self.device)
        self.value_net = nn.Sequential(*value_net).to(self.device)
        self.debug = nn.Sequential(
            nn.Linear(self.last_layer_dim_input, 64),
            activation_fn(),
            nn.Linear(64, 64),
            activation_fn()
        )
Example #20
    def __init__(
        self,
        observation_space: gym.spaces.Space,
        features_dim: int = 128,
        obs_per_timestep: int = 5,
        num_transformer_units: int = 2,
        attention_dim: int = 32,
        num_heads: int = 2,
        head_dim: int = 16,
        position_wise_mlp_dim: int = 32,
        init_gru_gate_bias: float = 2.0,
        activation_fn: Type[nn.Module] = th.nn.ReLU,
        device: Union[th.device, str] = "auto",
    ):
        super(TransformerExtractor, self).__init__()
        self.single_obs_dim = observation_space.shape[1]
        self.features_dim = features_dim
        self.input_dim = 64
        self.device = get_device(device)
        self.num_transformer_units = num_transformer_units
        self.attention_dim = attention_dim
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.position_wise_mlp_dim = position_wise_mlp_dim
        self.init_gru_gate_bias = init_gru_gate_bias
        self.obs_per_timestep = obs_per_timestep  # number of obs per timesteps

        self.linear_layer = FullyConnected(in_size=self.single_obs_dim,
                                           out_size=self.attention_dim).to(
                                               self.device)

        transformer_layers = []

        for i in range(self.num_transformer_units):
            # RelativeMultiHeadAttention part.
            MHA_layer = SkipConnection(
                MultiHeadAttention(in_dim=self.attention_dim,
                                   out_dim=self.attention_dim,
                                   num_heads=num_heads,
                                   head_dim=head_dim,
                                   input_layernorm=True,
                                   output_activation=nn.ReLU,
                                   device=self.device),
                fan_in_layer=GRUGate(self.attention_dim,
                                     init_gru_gate_bias,
                                     device=self.device))

            # Position-wise MultiLayerPerceptron part.
            E_layer = SkipConnection(nn.Sequential(
                th.nn.LayerNorm(self.attention_dim),
                FullyConnected(in_size=self.attention_dim,
                               out_size=position_wise_mlp_dim,
                               use_bias=False,
                               activation_fn=nn.ReLU),
                FullyConnected(in_size=position_wise_mlp_dim,
                               out_size=self.attention_dim,
                               use_bias=False,
                               activation_fn=nn.ReLU)),
                                     fan_in_layer=GRUGate(self.attention_dim,
                                                          init_gru_gate_bias,
                                                          device=self.device))

            # Build a list of all attention layers in order.
            transformer_layers.extend([MHA_layer, E_layer])

        transformer_layers.append(th.nn.Flatten())
        transformer_layers.append(
            nn.Linear(self.input_dim * self.attention_dim, self.features_dim))

        self.transformer = nn.Sequential(*transformer_layers).to(self.device)

        print(
            "Swarmformer # trainable paramaters",
            sum(p.numel() for p in self.transformer.parameters()
                if p.requires_grad))
Example #21
def test_save_load(tmp_path, model_class):
    """
    Test if 'save' and 'load' save and load the model correctly
    and if 'load_parameters' and 'get_policy_parameters' work correctly.

    Warning: does not test the loading of optimizer parameters.

    :param model_class: (BaseAlgorithm) An RL model
    """

    env = DummyVecEnv([lambda: select_env(model_class)])

    # create model
    model = model_class("MlpPolicy", env, policy_kwargs=dict(net_arch=[16]), verbose=1)
    model.learn(total_timesteps=500, eval_freq=250)

    env.reset()
    observations = np.concatenate([env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    # Get dictionary of current parameters
    params = deepcopy(model.policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())

    # Update model parameters with the new random values
    model.policy.load_state_dict(random_params)

    new_params = model.policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # Check if the model loads as expected for every possible choice of device:
    for device in ["auto", "cpu", "cuda"]:
        model = model_class.load(str(tmp_path / "test_save.zip"), env=env, device=device)

        # check if the model was loaded to the correct device
        assert model.device.type == get_device(device).type
        assert model.policy.device.type == get_device(device).type

        # check if params are still the same after load
        new_params = model.policy.state_dict()

        # Check that all params are the same as before save load procedure now
        for key in params:
            assert new_params[key].device.type == get_device(device).type
            assert th.allclose(
                params[key].to("cpu"), new_params[key].to("cpu")
            ), "Model parameters not the same after save and load."

        # check if model still selects the same actions
        new_selected_actions, _ = model.predict(observations, deterministic=True)
        assert np.allclose(selected_actions, new_selected_actions, 1e-4)

        # check if learn still works
        model.learn(total_timesteps=1000, eval_freq=500)

        del model

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
Example #22
def load_from_zip_file(
    load_path: Union[str, pathlib.Path, io.BufferedIOBase],
    load_data: bool = True,
    custom_objects: Optional[Dict[str, Any]] = None,
    device: Union[th.device, str] = "auto",
    verbose: int = 0,
    print_system_info: bool = False,
) -> (Tuple[Optional[Dict[str, Any]], Optional[TensorDict],
            Optional[TensorDict]]):
    """
    Load model data from a .zip archive

    :param load_path: Where to load the model from
    :param load_data: Whether we should load and return data
        (class parameters). Mainly used by 'load_parameters' to only load model parameters (weights)
    :param custom_objects: Dictionary of objects to replace
        upon loading. If a variable is present in this dictionary as a
        key, it will not be deserialized and the corresponding item
        will be used instead. Similar to custom_objects in
        ``keras.models.load_model``. Useful when you have an object in
        file that can not be deserialized.
    :param device: Device on which the code should run.
    :param verbose: Verbosity level, 0 means only warnings, 2 means debug information.
    :param print_system_info: Whether to print or not the system info
        about the saved model.
    :return: Class parameters, model state_dicts (aka "params", dict of state_dict)
        and dict of pytorch variables
    """
    load_path = open_path(load_path, "r", verbose=verbose, suffix="zip")

    # set device to cpu if cuda is not available
    device = get_device(device=device)

    # Open the zip archive and load data
    try:
        with zipfile.ZipFile(load_path) as archive:
            namelist = archive.namelist()
            # If data or parameters is not in the
            # zip archive, assume they were stored
            # as None (_save_to_file_zip allows this).
            data = None
            pytorch_variables = None
            params = {}

            # Debug system info first
            if print_system_info:
                if "system_info.txt" in namelist:
                    print("== SAVED MODEL SYSTEM INFO ==")
                    print(archive.read("system_info.txt").decode())
                else:
                    warnings.warn(
                        "The model was saved with SB3 <= 1.2.0 and thus cannot print system information.",
                        UserWarning,
                    )

            if "data" in namelist and load_data:
                # Load class parameters that are stored
                # with either JSON or pickle (not PyTorch variables).
                json_data = archive.read("data").decode()
                data = json_to_data(json_data, custom_objects=custom_objects)

            # Check for all .pth files and load them using th.load.
            # "pytorch_variables.pth" stores PyTorch variables, and any other .pth
            # files store state_dicts of variables with custom names (e.g. policy, policy.optimizer)
            pth_files = [
                file_name for file_name in namelist
                if os.path.splitext(file_name)[1] == ".pth"
            ]
            for file_path in pth_files:
                with archive.open(file_path, mode="r") as param_file:
                    # File has to be seekable, but param_file is not, so load in BytesIO first
                    # fixed in python >= 3.7
                    file_content = io.BytesIO()
                    file_content.write(param_file.read())
                    # go to start of file
                    file_content.seek(0)
                    # Load the parameters with the right ``map_location``.
                    # Remove ".pth" ending with splitext
                    th_object = th.load(file_content, map_location=device)
                    # "tensors.pth" was renamed "pytorch_variables.pth" in v0.9.0, see PR #138
                    if file_path == "pytorch_variables.pth" or file_path == "tensors.pth":
                        # PyTorch variables (not state_dicts)
                        pytorch_variables = th_object
                    else:
                        # State dicts. Store into params dictionary
                        # with same name as in .zip file (without .pth)
                        params[os.path.splitext(file_path)[0]] = th_object
    except zipfile.BadZipFile:
        # load_path wasn't a zip file
        raise ValueError(f"Error: the file {load_path} wasn't a zip-file")
    return data, params, pytorch_variables
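
A hedged usage sketch (the archive name is a placeholder):

# Hypothetical usage: inspect the contents of a saved SB3 model, mapping all tensors to the CPU.
data, params, pytorch_variables = load_from_zip_file("ppo_cartpole.zip", device="cpu")
print(list(params.keys()))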
Example #23
    def __init__(
        self,
        observation_space: gym.Space,
        action_space: gym.Space,
        *,
        policy_class: Type[policies.BasePolicy] = base.FeedForward32Policy,
        policy_kwargs: Optional[Mapping[str, Any]] = None,
        expert_data: Union[
            types.TransitionsMinimal,
            datasets.Dataset[types.TransitionsMinimal],
            None,
        ] = None,
        batch_size: int = 32,
        optimizer_cls: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
        ent_weight: float = 1e-3,
        l2_weight: float = 0.0,
        device: Union[str, th.device] = "auto",
    ):
        """Behavioral cloning (BC).

        Recovers a policy via supervised learning on a Dataset of observation-action
        pairs.

        Args:
            observation_space: the observation space of the environment.
            action_space: the action space of the environment.
            policy_class: used to instantiate imitation policy.
            policy_kwargs: keyword arguments passed to policy's constructor.
            expert_data: If not None, then immediately call
                  `self.set_expert_dataset(expert_data)` during initialization.
            batch_size: batch size used for training.
            optimizer_cls: optimiser to use for supervised training.
            optimizer_kwargs: keyword arguments, excluding learning rate and
                  weight decay, for optimiser construction.
            ent_weight: scaling applied to the policy's entropy regularization.
            l2_weight: scaling applied to the policy's L2 regularization.
            device: name/identity of device to place policy on.
        """
        if optimizer_kwargs:
            if "weight_decay" in optimizer_kwargs:
                raise ValueError(
                    "Use the parameter l2_weight insteand of weight_decay."
                )

        self.action_space = action_space
        self.observation_space = observation_space
        self.policy_class = policy_class
        self.device = device = utils.get_device(device)
        self.policy_kwargs = dict(
            observation_space=self.observation_space,
            action_space=self.action_space,
            lr_schedule=ConstantLRSchedule(),
            device=self.device,
        )
        self.policy_kwargs.update(policy_kwargs or {})

        self.policy = self.policy_class(**self.policy_kwargs).to(
            self.device
        )  # pytype: disable=not-instantiable
        optimizer_kwargs = optimizer_kwargs or {}
        self.optimizer = optimizer_cls(self.policy.parameters(), **optimizer_kwargs)

        assert batch_size >= 1
        self.batch_size = batch_size
        self.expert_dataset: Optional[datasets.Dataset[types.TransitionsMinimal]] = None
        self.ent_weight = ent_weight
        self.l2_weight = l2_weight

        if expert_data is not None:
            self.set_expert_dataset(expert_data)
Example #24
    def __init__(
        self,
        attacker_policy: Type[BasePolicy],
        defender_policy: Type[BasePolicy],
        env: Union[GymEnv, str, None],
        policy_base: Type[BasePolicy],
        attacker_learning_rate: Union[float, Schedule],
        defender_learning_rate: Union[float, Schedule],
        attacker_policy_kwargs: Dict[str, Any] = None,
        defender_policy_kwargs: Dict[str, Any] = None,
        tensorboard_log: Optional[str] = None,
        device: Union[th.device, str] = "auto",
        seed: Optional[int] = None,
        train_mode: TrainMode = TrainMode.TRAIN_ATTACKER,
        attacker_agent_config: AgentConfig = None,
        defender_agent_config: AgentConfig = None
    ):
        self.attacker_agent_config = attacker_agent_config
        self.defender_agent_config = defender_agent_config
        try:
            self.tensorboard_writer = SummaryWriter(self.attacker_agent_config.tensorboard_dir)
            self.tensorboard_writer.add_hparams(self.attacker_agent_config.hparams_dict(), {})
        except Exception:
            print("error creating tensorboard writer")

        if isinstance(attacker_policy, str) and policy_base is not None:
            self.attacker_policy_class = get_policy_from_name(policy_base, attacker_policy)
        else:
            self.attacker_policy_class = attacker_policy

        if isinstance(defender_policy, str) and policy_base is not None:
            self.defender_policy_class = get_policy_from_name(policy_base, defender_policy)
        else:
            self.defender_policy_class = defender_policy

        self.device = get_device(device)
        self.env = None
        self._vec_normalize_env = unwrap_vec_normalize(env)
        self.attacker_policy_kwargs = {} if attacker_policy_kwargs is None else attacker_policy_kwargs
        self.defender_policy_kwargs = {} if defender_policy_kwargs is None else defender_policy_kwargs
        self.attacker_observation_space = None
        self.attacker_action_space = None
        self.defender_observation_space = None
        self.defender_action_space = None
        self.n_envs = None
        self.num_timesteps = 0
        # Used for updating schedules
        self._total_timesteps = 0
        self.seed = seed
        self.start_time = None
        self.attacker_policy = None
        self.defender_policy = None
        self.attacker_learning_rate = attacker_learning_rate
        self.defender_learning_rate = defender_learning_rate
        self.tensorboard_log = tensorboard_log
        self._last_obs = None
        self._last_episode_starts = None
        self._last_original_obs = None
        self._last_dones = None
        self._episode_num = 0
        self._current_progress_remaining = 1
        self.ep_info_buffer = None
        self.ep_success_buffer = None
        self._n_updates = 0
        self.train_mode = train_mode
        self.train_result = ExperimentResult()
        self.eval_result = ExperimentResult()
        self.training_start = time.time()

        # Create and wrap the env if needed
        if env is not None:
            env = maybe_make_env(env)
            env = self._wrap_env(env)
            self.attacker_observation_space = env.attacker_observation_space
            self.attacker_action_space = env.attacker_action_space
            self.defender_observation_space = env.defender_observation_space
            self.defender_action_space = env.defender_action_space
            self.n_envs = env.num_envs
            self.env = env
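For context, a brief sketch of how a progress-based Schedule such as attacker_learning_rate is typically consumed alongside _current_progress_remaining; the helper names below are illustrative assumptions, not part of the class above:

# Illustrative helpers; `as_schedule` and `current_lr` are assumed names.
from typing import Callable, Union

Schedule = Callable[[float], float]

def as_schedule(value: Union[float, Schedule]) -> Schedule:
    # Wrap a constant learning rate so constants and callables are handled uniformly.
    if callable(value):
        return value
    return lambda progress_remaining: value

def current_lr(schedule: Schedule, num_timesteps: int, total_timesteps: int) -> float:
    # progress_remaining goes from 1.0 at the start of training to 0.0 at the end.
    progress_remaining = 1.0 - float(num_timesteps) / float(total_timesteps)
    return schedule(progress_remaining)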
Ejemplo n.º 25
0
def test_save_load(tmp_path, model_class):
    """
    Test if 'save' and 'load' saves and loads model correctly
    and if 'get_parameters' and 'set_parameters' and work correctly.

    ''warning does not test function of optimizer parameter load

    :param model_class: (BaseAlgorithm) A RL model
    """

    env = DummyVecEnv([lambda: select_env(model_class)])

    # create model
    model = model_class("MlpPolicy", env, policy_kwargs=dict(net_arch=[16]), verbose=1)
    model.learn(total_timesteps=500)

    env.reset()
    observations = np.concatenate([env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    # Get parameters of different objects
    # deepcopy to avoid referencing to tensors we are about to modify
    original_params = deepcopy(model.get_parameters())

    # Test different error cases of set_parameters.
    # Test that invalid object names throw errors
    invalid_object_params = deepcopy(original_params)
    invalid_object_params["I_should_not_be_a_valid_object"] = "and_I_am_an_invalid_tensor"
    with pytest.raises(ValueError):
        model.set_parameters(invalid_object_params, exact_match=True)
    with pytest.raises(ValueError):
        model.set_parameters(invalid_object_params, exact_match=False)

    # Test that exact_match catches when something was missed.
    missing_object_params = dict((k, v) for k, v in list(original_params.items())[:-1])
    with pytest.raises(ValueError):
        model.set_parameters(missing_object_params, exact_match=True)

    # Test that exact_match catches when something inside state-dict
    # is missing but we have exact_match.
    missing_state_dict_tensor_params = {}
    for object_name in original_params:
        object_params = {}
        missing_state_dict_tensor_params[object_name] = object_params
        # Skip last item in state-dict
        for k, v in list(original_params[object_name].items())[:-1]:
            object_params[k] = v
    with pytest.raises(RuntimeError):
        # PyTorch load_state_dict throws RuntimeError if strict but
        # invalid state-dict.
        model.set_parameters(missing_state_dict_tensor_params, exact_match=True)

    # Test that parameters do indeed change.
    random_params = {}
    for object_name, params in original_params.items():
        # Do not randomize optimizer parameters (custom layout)
        if "optim" in object_name:
            random_params[object_name] = params
        else:
            # Again, skip the last item in state-dict
            random_params[object_name] = OrderedDict(
                (param_name, th.rand_like(param)) for param_name, param in list(params.items())[:-1]
            )

    # Update model parameters with the new random values
    model.set_parameters(random_params, exact_match=False)

    new_params = model.get_parameters()
    # Check that all params except the final item in each state-dict are different.
    for object_name in original_params:
        # Skip optimizers (no valid comparison with just th.allclose)
        if "optim" in object_name:
            continue
        # state-dicts use ordered dictionaries, so key order
        # is guaranteed.
        last_key = list(original_params[object_name].keys())[-1]
        for k in original_params[object_name]:
            if k == last_key:
                # Should be same as before
                assert th.allclose(
                    original_params[object_name][k], new_params[object_name][k]
                ), "Parameter changed despite not included in the loaded parameters."
            else:
                # Should be different
                assert not th.allclose(
                    original_params[object_name][k], new_params[object_name][k]
                ), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # Check if the model loads as expected for every possible choice of device:
    for device in ["auto", "cpu", "cuda"]:
        model = model_class.load(str(tmp_path / "test_save.zip"), env=env, device=device)

        # check if the model was loaded to the correct device
        assert model.device.type == get_device(device).type
        assert model.policy.device.type == get_device(device).type

        # check if params are still the same after load
        new_params = model.get_parameters()

        # Check that all params are the same as before save load procedure now
        for object_name in new_params:
            # Skip optimizers (no valid comparison with just th.allclose)
            if "optim" in object_name:
                continue
            for key in params[object_name]:
                assert new_params[object_name][key].device.type == get_device(device).type
                assert th.allclose(
                    params[object_name][key].to("cpu"), new_params[object_name][key].to("cpu")
                ), "Model parameters not the same after save and load."

        # check if model still selects the same actions
        new_selected_actions, _ = model.predict(observations, deterministic=True)
        assert np.allclose(selected_actions, new_selected_actions, 1e-4)

        # check if learn still works
        model.learn(total_timesteps=500)

        del model

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
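The select_env helper used in this test is defined elsewhere; a minimal stand-in (the algorithm-to-environment mapping and env ids are assumptions) could look like:

# Hypothetical stand-in for select_env; env ids and the algorithm list are assumptions.
import gym
from stable_baselines3 import DDPG, SAC, TD3

def select_env(model_class) -> gym.Env:
    # Continuous-control algorithms need a Box action space; the rest can use CartPole.
    if model_class in (SAC, TD3, DDPG):
        return gym.make("Pendulum-v1")  # or "Pendulum-v0", depending on the gym version
    return gym.make("CartPole-v1")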
Ejemplo n.º 26
0
def load_from_zip_file(
    load_path: Union[str, pathlib.Path, io.BufferedIOBase],
    load_data: bool = True,
    device: Union[th.device, str] = "auto",
    verbose: int = 0,
) -> Tuple[Optional[Dict[str, Any]], Optional[TensorDict], Optional[TensorDict]]:
    """
    Load model data from a .zip archive

    :param load_path: Where to load the model from
    :param load_data: Whether we should load and return data
        (class parameters). Mainly used by 'load_parameters' to only load model parameters (weights)
    :param device: Device on which the code should run.
    :param verbose: Verbosity level.
    :return: Class parameters, model state_dicts (aka "params", dict of state_dict)
        and dict of pytorch variables
    """
    load_path = open_path(load_path, "r", verbose=verbose, suffix="zip")

    # set device to cpu if cuda is not available
    device = get_device(device=device)

    # Open the zip archive and load data
    try:
        with zipfile.ZipFile(load_path) as archive:
            namelist = archive.namelist()
            # If data or parameters is not in the
            # zip archive, assume they were stored
            # as None (_save_to_file_zip allows this).
            data = None
            pytorch_variables = None
            params = {}

            if "data" in namelist and load_data:
                # Load class parameters that are stored
                # with either JSON or pickle (not PyTorch variables).
                json_data = archive.read("data").decode()
                data = json_to_data(json_data)

            # Check for all .pth files and load them using th.load.
            # "pytorch_variables.pth" stores PyTorch variables, and any other .pth
            # files store state_dicts of variables with custom names (e.g. policy, policy.optimizer)
            pth_files = [
                file_name for file_name in namelist
                if os.path.splitext(file_name)[1] == ".pth"
            ]
            for file_path in pth_files:
                with archive.open(file_path, mode="r") as param_file:
                    # File has to be seekable, but param_file is not, so load
                    # it into BytesIO first (this was fixed in Python >= 3.7).
                    file_content = io.BytesIO()
                    file_content.write(param_file.read())
                    # go to start of file
                    file_content.seek(0)
                    # Load the parameters with the right ``map_location``.
                    # Remove ".pth" ending with splitext
                    th_object = th.load(file_content, map_location=device)
                    # "tensors.pth" was renamed "pytorch_variables.pth" in v0.9.0, see PR #138
                    if file_path == "pytorch_variables.pth" or file_path == "tensors.pth":
                        # PyTorch variables (not state_dicts)
                        pytorch_variables = th_object
                    else:
                        # State dicts. Store into params dictionary
                        # with same name as in .zip file (without .pth)
                        params[os.path.splitext(file_path)[0]] = th_object
    except zipfile.BadZipFile:
        # load_path wasn't a zip file
        raise ValueError(f"Error: the file {load_path} wasn't a zip-file")
    return data, params, pytorch_variables
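A short usage sketch for the function above. The archive name and the "policy" key are assumptions; the keys of `params` mirror the .pth file names stored in the archive:

# Illustrative call; "model.zip" and the "policy" key are assumptions.
data, params, pytorch_variables = load_from_zip_file("model.zip", device="cpu")

# `data` holds the deserialized class parameters (or None if not stored),
# `params` maps names such as "policy" or "policy.optimizer" to state_dicts,
# `pytorch_variables` holds any extra tensors saved alongside the model.
if "policy" in params:
    my_policy.load_state_dict(params["policy"])  # `my_policy` must already be constructed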