def __init__(
    self,
    observation_space: gym.Space,
    action_space: gym.Space,
    policy: policies.BasePolicy,
    expert_data: Union[Iterable[Mapping], types.TransitionsMinimal, None] = None,
    optimizer_cls: Type[th.optim.Optimizer] = th.optim.Adam,
    optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ent_weight: float = 1e-3,
    l2_weight: float = 0.0,
    device: Union[str, th.device] = "auto",
):
    """Behavioral cloning (BC).

    Recovers a policy via supervised learning on observation-action Tensor
    pairs, sampled from a Torch DataLoader or any Iterator that ducktypes
    `torch.utils.data.DataLoader`.

    Args:
        observation_space: the observation space of the environment.
        action_space: the action space of the environment.
        policy: the policy to be trained.
        expert_data: If not None, then immediately call
            `self.set_expert_data_loader(expert_data)` during initialization.
        optimizer_cls: optimiser to use for supervised training.
        optimizer_kwargs: keyword arguments, excluding learning rate and
            weight decay, for optimiser construction.
        ent_weight: scaling applied to the policy's entropy regularization.
        l2_weight: scaling applied to the policy's L2 regularization.
        device: name/identity of device to place policy on.
    """
    if optimizer_kwargs:
        if "weight_decay" in optimizer_kwargs:
            raise ValueError("Use the parameter l2_weight instead of weight_decay.")

    self.action_space = action_space
    self.observation_space = observation_space
    self.device = utils.get_device(device)
    # `policy` is an already-instantiated policy (its `.parameters()` are
    # handed to the optimizer below), so move it to the target device.
    self.policy = policy.to(self.device)  # pytype: disable=not-instantiable
    optimizer_kwargs = optimizer_kwargs or {}
    self.optimizer = optimizer_cls(self.policy.parameters(), **optimizer_kwargs)

    self.expert_data_loader: Optional[Iterable[Mapping]] = None
    self.ent_weight = ent_weight
    self.l2_weight = l2_weight

    if expert_data is not None:
        self.set_expert_data_loader(expert_data)
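# Hedged usage sketch (not from the original source): constructing the BC
# trainer defined above. `venv`, `make_policy`, and `expert_transitions` are
# placeholder names for a vectorized environment, an already-instantiated
# BasePolicy, and expert data loaded elsewhere.
bc_trainer = BC(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    policy=make_policy(venv),          # an instantiated policies.BasePolicy
    expert_data=expert_transitions,    # triggers set_expert_data_loader()
    device="auto",
)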
def __init__(self,
             in_channels: int,
             out_channels: int,
             update_masks: Dict[str, Tuple[List[int], int]],
             device: Union[torch.device, str] = "auto",
             cached: bool = False,
             bias: bool = True,
             **kwargs):
    kwargs.setdefault('aggr', 'add')
    super(NerveNetConv, self).__init__(**kwargs)

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.update_masks = update_masks
    self.use_bias = bias
    self.cached = cached
    self.device = get_device(device)

    self._cached_edge_index = None
    self._cached_adj_t = None

    self.update_models_parameter = {}
    for group_name in update_masks:
        self.update_models_parameter[group_name] = {}
        # Move the tensor *before* wrapping it in Parameter: calling .to()
        # on a Parameter returns a plain Tensor whenever a copy is made, so
        # `Parameter(...).to(device)` would silently lose its Parameter
        # status (and registration) on GPU.
        self.update_models_parameter[group_name]["weights"] = Parameter(
            torch.Tensor(in_channels, out_channels).to(self.device))
        if self.use_bias:
            self.update_models_parameter[group_name]["bias"] = Parameter(
                torch.Tensor(out_channels).to(self.device))
        else:
            self.update_models_parameter[group_name]["bias"] = None

    self.reset_parameters()
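# Hedged aside (not from the original source): a minimal sketch of the
# Parameter/.to() footgun the constructor above works around. When .to()
# actually copies (e.g. cpu -> cuda), it returns a plain Tensor, so the
# result is no longer an nn.Parameter and will not show up in
# Module.parameters().
import torch
from torch.nn import Parameter

p = Parameter(torch.zeros(2, 3))
print(isinstance(p.to("cpu"), Parameter))   # True: same device, .to() is a no-op
if torch.cuda.is_available():
    moved = p.to("cuda")
    print(isinstance(moved, Parameter))     # False: the copy is a plain Tensor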
def load(cls, path: str, device: Union[th.device, str] = "auto") -> "BaseModel":
    """
    Load model from path.

    :param path:
    :param device: Device on which the policy should be loaded.
    :return:
    """
    device = get_device(device)
    saved_variables = th.load(path, map_location=device)

    # Allow to load policy saved with older version of SB3
    if "sde_net_arch" in saved_variables["data"]:
        warnings.warn(
            "sde_net_arch is deprecated, please downgrade to SB3 v1.2.0 if you need such parameter.",
            DeprecationWarning,
        )
        del saved_variables["data"]["sde_net_arch"]

    # Create policy object
    model = cls(**saved_variables["data"])  # pytype: disable=not-instantiable
    # Load weights
    model.load_state_dict(saved_variables["state_dict"])
    model.to(device)
    return model
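# Hedged usage sketch: round-tripping an SB3 policy through save()/load();
# the path is a placeholder.
from stable_baselines3 import PPO

model = PPO("MlpPolicy", "CartPole-v1")
model.policy.save("/tmp/policy.pth")  # stores constructor data + state_dict
policy = type(model.policy).load("/tmp/policy.pth", device="cpu")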
def __init__(
    self,
    in_features_dim: int,
    hidden_layer_size: int = 32,
    n_lstm_cells: int = 16,
    batch_size: int = 1024,
    orthogonal_init: bool = False,
    activation_fn: Type[nn.Module] = nn.Tanh,
    # "gpu" is not a device string torch accepts; "auto" (resolving to cuda
    # when available) is the intended default.
    device: Union[th.device, str] = "auto",
):
    super(LSTM, self).__init__()
    self.device = get_device(device)
    self.hidden_layer_size = hidden_layer_size
    self.n_lstm_cells = n_lstm_cells
    self.batch_size = batch_size

    # `in_features_dim` is the (integer) size of the input feature vector.
    self.lstm = nn.LSTM(in_features_dim, hidden_layer_size, num_layers=n_lstm_cells)
    self.linear = nn.Linear(hidden_layer_size, hidden_layer_size)
    self.hidden = self.init_hidden(self.batch_size)
def __init__(
    self,
    observation_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    device: Union[th.device, str] = "auto",
    features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
    features_extractor_kwargs: Optional[Dict[str, Any]] = None,
    features_extractor: Optional[nn.Module] = None,
    normalize_images: bool = True,
    optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
    optimizer_kwargs: Optional[Dict[str, Any]] = None,
):
    super(BaseModel, self).__init__()

    if optimizer_kwargs is None:
        optimizer_kwargs = {}

    if features_extractor_kwargs is None:
        features_extractor_kwargs = {}

    self.observation_space = observation_space
    self.action_space = action_space
    self.device = get_device(device)
    self.features_extractor = features_extractor
    self.normalize_images = normalize_images

    self.optimizer_class = optimizer_class
    self.optimizer_kwargs = optimizer_kwargs
    self.optimizer = None  # type: Optional[th.optim.Optimizer]

    self.features_extractor_class = features_extractor_class
    self.features_extractor_kwargs = features_extractor_kwargs
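# Hedged illustration of the get_device contract used throughout these
# snippets: "auto" resolves to cuda when available and falls back to cpu;
# explicit device strings are passed through to th.device.
from stable_baselines3.common.utils import get_device

print(get_device("auto"))  # device(type='cuda') if available, else cpu
print(get_device("cpu"))   # device(type='cpu')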
def __init__(
    self,
    feature_dim: int,
    net_arch: List[Union[int, Dict[str, List[int]]]],
    activation_fn: Type[nn.Module],
    device: Union[th.device, str] = "auto",
):
    super(MlpExtractor, self).__init__()
    device = get_device(device)
    shared_net, policy_net, value_net = [], [], []
    policy_only_layers = []  # Layer sizes of the network that only belongs to the policy network
    value_only_layers = []  # Layer sizes of the network that only belongs to the value network
    last_layer_dim_shared = feature_dim

    # Iterate through the shared layers and build the shared parts of the network
    for idx, layer in enumerate(net_arch):
        if isinstance(layer, int):  # Check that this is a shared layer
            layer_size = layer
            # TODO: give layer a meaningful name
            shared_net.append(nn.Linear(last_layer_dim_shared, layer_size))
            shared_net.append(activation_fn())
            last_layer_dim_shared = layer_size
        else:
            # Upstream SB3 asserts that every entry is an int or a dict;
            # here anything else is silently skipped instead.
            if not isinstance(layer, dict):
                continue
            if "pi" in layer:
                assert isinstance(layer["pi"], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                policy_only_layers = layer["pi"]
            if "vf" in layer:
                assert isinstance(layer["vf"], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                value_only_layers = layer["vf"]
            break  # From here on the network splits up in policy and value network

    last_layer_dim_pi = last_layer_dim_shared
    last_layer_dim_vf = last_layer_dim_shared

    # Build the non-shared part of the network
    for idx, (pi_layer_size, vf_layer_size) in enumerate(zip_longest(policy_only_layers, value_only_layers)):
        if pi_layer_size is not None:
            assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
            policy_net.append(nn.Linear(last_layer_dim_pi, pi_layer_size))
            policy_net.append(activation_fn())
            last_layer_dim_pi = pi_layer_size
        if vf_layer_size is not None:
            assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
            value_net.append(nn.Linear(last_layer_dim_vf, vf_layer_size))
            value_net.append(activation_fn())
            last_layer_dim_vf = vf_layer_size

    # Save dim, used to create the distributions
    self.latent_dim_pi = last_layer_dim_pi
    self.latent_dim_vf = last_layer_dim_vf

    # Create networks
    # If the list of layers is empty, the network will just act as an Identity module
    self.shared_net = nn.Sequential(*shared_net).to(device)
    self.policy_net = nn.Sequential(*policy_net).to(device)
    self.value_net = nn.Sequential(*value_net).to(device)
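# Hedged usage sketch: a net_arch value the extractor above accepts. Ints
# before the dict are shared layers; the dict (which must come last) splits
# into policy-only ("pi") and value-only ("vf") head sizes.
import torch.nn as nn

extractor = MlpExtractor(
    feature_dim=8,
    net_arch=[64, 64, dict(pi=[32], vf=[32, 16])],
    activation_fn=nn.Tanh,
    device="cpu",
)
print(extractor.latent_dim_pi, extractor.latent_dim_vf)  # 32, 16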
def test_predict(model_class, env_id, device):
    if device == "cuda" and not th.cuda.is_available():
        pytest.skip("CUDA not available")

    # Skip algorithm/action-space mismatches: SAC and TD3 need continuous
    # actions (not CartPole); DQN needs discrete actions.
    if env_id == "CartPole-v1":
        if model_class in [SAC, TD3]:
            return
    elif model_class in [DQN]:
        return

    # Test detection of different shapes by the predict method
    model = model_class("MlpPolicy", env_id, device=device)
    # Check that the policy is on the right device
    assert get_device(device).type == model.policy.device.type

    env = gym.make(env_id)
    vec_env = DummyVecEnv([lambda: gym.make(env_id), lambda: gym.make(env_id)])

    obs = env.reset()
    action, _ = model.predict(obs)
    assert action.shape == env.action_space.shape
    assert env.action_space.contains(action)

    vec_env_obs = vec_env.reset()
    action, _ = model.predict(vec_env_obs)
    assert action.shape[0] == vec_env_obs.shape[0]
def device(self) -> th.device:
    """Infer which device this policy lives on by inspecting its parameters.
    If it has no parameters, the 'cpu' device is used as a fallback.

    :return:
    """
    for param in self.parameters():
        return param.device
    return get_device("cpu")
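# Hedged sketch of the same inference trick on a plain nn.Module (the
# function name is an assumption, not from the original source): the device
# of the first parameter is returned, or cpu when there are no parameters.
import torch as th
import torch.nn as nn

def module_device(m: nn.Module) -> th.device:
    for param in m.parameters():
        return param.device   # device of the first parameter found
    return th.device("cpu")   # fallback for parameter-less modules

print(module_device(nn.Linear(3, 3)))  # cpu (unless the module was moved)
print(module_device(nn.Flatten()))     # cpu fallback: Flatten has no parameters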
def device(self) -> torch.device:
    """Infer which device this policy lives on by inspecting its parameters.
    If it has no parameters, the 'auto' device is used as a fallback.

    Note: the BasePolicy class returns 'cpu' rather than 'auto' as its
    fallback. However, when we use the FlattenExtractor as the feature
    extractor network, no parameters have been defined yet from which we
    could infer the correct device, so the fallback is always taken. With a
    'cpu' fallback we would therefore never be able to use the GPU when
    using FlattenExtractor.

    :return:
    """
    for param in self.parameters():
        return param.device
    return get_device("auto")
def load(cls, path: str, device: Union[th.device, str] = "auto") -> "BaseModel":
    """
    Load model from path.

    :param path:
    :param device: Device on which the policy should be loaded.
    :return:
    """
    device = get_device(device)
    saved_variables = th.load(path, map_location=device)
    # Create policy object
    model = cls(**saved_variables["data"])  # pytype: disable=not-instantiable
    # Load weights
    model.load_state_dict(saved_variables["state_dict"])
    model.to(device)
    return model
def reconstruct_policy(
    policy_path: str,
    device: Union[th.device, str] = "auto",
) -> policies.BasePolicy:
    """Reconstruct a saved policy.

    Args:
        policy_path: path where `.save_policy()` has been run.
        device: device on which to load the policy.

    Returns:
        policy: policy with reloaded weights.
    """
    policy = th.load(policy_path, map_location=utils.get_device(device))
    assert isinstance(policy, policies.BasePolicy)
    return policy
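# Hedged usage sketch: reloading a policy that a BC trainer previously wrote
# with save_policy(); the path is a placeholder.
policy = reconstruct_policy("/tmp/bc_policy.pt", device="cpu")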
def reconstruct_trainer(
    scratch_dir: str,
    device: Union[th.device, str] = "auto",
) -> "DAggerTrainer":
    """Reconstruct trainer from the latest snapshot in some working directory.

    Args:
        scratch_dir: path to the working directory created by a previous run
            of this algorithm. The directory should contain
            `checkpoint-latest.pt` and `policy-latest.pt` files.
        device: device on which to load the trainer.

    Returns:
        trainer: a reconstructed `DAggerTrainer` with the same state as the
            previously-saved one.
    """
    checkpoint_path = os.path.join(scratch_dir, "checkpoint-latest.pt")
    return th.load(checkpoint_path, map_location=utils.get_device(device))
def load(cls, path: str, device: Union[th.device, str] = 'auto') -> 'BasePolicy':
    """
    Load policy from path.

    :param path: (str)
    :param device: (Union[th.device, str]) Device on which the policy should be loaded.
    :return: (BasePolicy)
    """
    device = get_device(device)
    saved_variables = th.load(path, map_location=device)
    # Create policy object
    model = cls(**saved_variables['data'])
    # Load weights
    model.load_state_dict(saved_variables['state_dict'])
    model.to(device)
    return model
def __init__(
    self,
    policy: Type[BasePolicy],
    env: Union[GymEnv, str, None],
    policy_base: Type[BasePolicy],
    learning_rate: Union[float, Callable],
    policy_kwargs: Dict[str, Any] = None,
    tensorboard_log: Optional[str] = None,
    verbose: int = 0,
    device: Union[th.device, str] = "auto",
    support_multi_env: bool = False,
    create_eval_env: bool = False,
    monitor_wrapper: bool = True,
    seed: Optional[int] = None,
    use_sde: bool = False,
    sde_sample_freq: int = -1,
):
    if isinstance(policy, str) and policy_base is not None:
        self.policy_class = get_policy_from_name(policy_base, policy)
    else:
        self.policy_class = policy

    self.device = get_device(device)
    if verbose > 0:
        print(f"Using {self.device} device")

    self.env = None  # type: Optional[GymEnv]
    # get VecNormalize object if needed
    self._vec_normalize_env = unwrap_vec_normalize(env)
    self.verbose = verbose
    self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
    self.observation_space = None  # type: Optional[gym.spaces.Space]
    self.action_space = None  # type: Optional[gym.spaces.Space]
    self.n_envs = None
    self.num_timesteps = 0
    # Used for updating schedules
    self._total_timesteps = 0
    self.eval_env = None
    self.seed = seed
    self.action_noise = None  # type: Optional[ActionNoise]
    self.start_time = None
    self.policy = None
    self.learning_rate = learning_rate
    self.tensorboard_log = tensorboard_log
    self.lr_schedule = None  # type: Optional[Callable]
    self._last_obs = None  # type: Optional[np.ndarray]
    # When using VecNormalize:
    self._last_original_obs = None  # type: Optional[np.ndarray]
    self._episode_num = 0
    # Used for gSDE only
    self.use_sde = use_sde
    self.sde_sample_freq = sde_sample_freq
    # Track the training progress remaining (from 1 to 0)
    # this is used to update the learning rate
    self._current_progress_remaining = 1
    # Buffers for logging
    self.ep_info_buffer = None  # type: Optional[deque]
    self.ep_success_buffer = None  # type: Optional[deque]
    # For logging
    self._n_updates = 0  # type: int

    # Create and wrap the env if needed
    if env is not None:
        if isinstance(env, str):
            if create_eval_env:
                self.eval_env = maybe_make_env(env, monitor_wrapper, self.verbose)
        env = maybe_make_env(env, monitor_wrapper, self.verbose)
        env = self._wrap_env(env)

        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.n_envs = env.num_envs
        self.env = env

        if not support_multi_env and self.n_envs > 1:
            raise ValueError(
                "Error: the model does not support multiple envs; it requires "
                "a single vectorized environment."
            )

        # gSDE requires a continuous *action* space, matching the error message.
        if self.use_sde and not isinstance(self.action_space, gym.spaces.Box):
            raise ValueError(
                "generalized State-Dependent Exploration (gSDE) can only be used with continuous actions."
            )
def __init__(self,
             policy: Type[BasePolicy],
             env: Union[GymEnv, str],
             policy_base: Type[BasePolicy],
             learning_rate: Union[float, Callable],
             policy_kwargs: Dict[str, Any] = None,
             verbose: int = 0,
             device: Union[th.device, str] = 'auto',
             support_multi_env: bool = False,
             create_eval_env: bool = False,
             monitor_wrapper: bool = True,
             seed: Optional[int] = None,
             use_sde: bool = False,
             sde_sample_freq: int = -1,
             tensorboard_log=None):
    if isinstance(policy, str) and policy_base is not None:
        self.policy_class = get_policy_from_name(policy_base, policy)
    else:
        self.policy_class = policy

    self.device = get_device(device)
    if verbose > 0:
        print(f"Using {self.device} device")

    self.env = None  # type: Optional[GymEnv]
    # get VecNormalize object if needed
    self._vec_normalize_env = unwrap_vec_normalize(env)
    self.verbose = verbose
    self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
    self.observation_space = None  # type: Optional[gym.spaces.Space]
    self.action_space = None  # type: Optional[gym.spaces.Space]
    self.n_envs = None
    self.num_timesteps = 0
    self.eval_env = None
    self.seed = seed
    self.action_noise = None  # type: Optional[ActionNoise]
    self.start_time = None
    self.policy = None
    self.learning_rate = learning_rate
    self.lr_schedule = None  # type: Optional[Callable]
    self._last_obs = None  # type: Optional[np.ndarray]
    # When using VecNormalize:
    self._last_original_obs = None  # type: Optional[np.ndarray]
    self._episode_num = 0
    # Used for SDE only
    self.use_sde = use_sde
    self.sde_sample_freq = sde_sample_freq
    # Track the training progress (from 1 to 0)
    # this is used to update the learning rate
    self._current_progress = 1
    # Buffers for logging
    self.ep_info_buffer = None  # type: Optional[deque]
    self.ep_success_buffer = None  # type: Optional[deque]
    # For logging
    self._n_updates = 0  # type: int
    self.tensorboard_log = tensorboard_log

    # Create and wrap the env if needed
    if env is not None:
        if isinstance(env, str):
            if create_eval_env:
                eval_env = gym.make(env)
                if monitor_wrapper:
                    eval_env = Monitor(eval_env, filename=None)
                self.eval_env = DummyVecEnv([lambda: eval_env])
            if self.verbose >= 1:
                print("Creating environment from the given name, wrapped in a DummyVecEnv.")
            env = gym.make(env)
            if monitor_wrapper:
                env = Monitor(env, filename=None)
            env = DummyVecEnv([lambda: env])

        env = self._wrap_env(env)

        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.n_envs = env.num_envs
        self.env = env

        if not support_multi_env and self.n_envs > 1:
            raise ValueError("Error: the model does not support multiple envs; it requires "
                             "a single vectorized environment.")

    # -------------------- logging/tensorboard -------------------- #
    output_formats = [HumanOutputFormat(sys.stdout)]
    if self.tensorboard_log is not None:
        output_formats.append(TensorBoardOutputFormat(self.tensorboard_log))
    self.logger = Logger(folder=None, output_formats=output_formats)
def _load_from_file(load_path: str,
                    load_data: bool = True) -> (Tuple[Optional[Dict[str, Any]],
                                                      Optional[TensorDict],
                                                      Optional[TensorDict]]):
    """
    Load model data from a .zip archive

    :param load_path: Where to load the model from
    :param load_data: Whether we should load and return data
        (class parameters). Mainly used by 'load_parameters' to
        only load model parameters (weights)
    :return: (dict),(dict),(dict) Class parameters, model state_dicts
        (dict of state_dict) and dict of extra tensors
    """
    # Check if file exists if load_path is a string
    if isinstance(load_path, str):
        if not os.path.exists(load_path):
            if os.path.exists(load_path + ".zip"):
                load_path += ".zip"
            else:
                raise ValueError(f"Error: the file {load_path} could not be found")

    # set device to cpu if cuda is not available
    device = get_device()

    # Open the zip archive and load data
    try:
        with zipfile.ZipFile(load_path, "r") as archive:
            namelist = archive.namelist()
            # If data or parameters is not in the
            # zip archive, assume they were stored
            # as None (_save_to_file_zip allows this).
            data = None
            tensors = None
            params = {}

            if "data" in namelist and load_data:
                # Load class parameters and convert to string
                json_data = archive.read("data").decode()
                data = json_to_data(json_data)

            if "tensors.pth" in namelist and load_data:
                # Load extra tensors
                with archive.open('tensors.pth', mode="r") as tensor_file:
                    # File has to be seekable, but tensor_file is not, so load in BytesIO first
                    # fixed in python >= 3.7
                    file_content = io.BytesIO()
                    file_content.write(tensor_file.read())
                    # go to start of file
                    file_content.seek(0)
                    # load the parameters with the right ``map_location``
                    tensors = th.load(file_content, map_location=device)

            # check for all other .pth files
            other_files = [file_name for file_name in namelist
                           if os.path.splitext(file_name)[1] == ".pth" and file_name != "tensors.pth"]
            # if there are any other files which end with .pth and aren't "tensors.pth",
            # assume that they each are optimizer parameters
            if len(other_files) > 0:
                for file_path in other_files:
                    with archive.open(file_path, mode="r") as opt_param_file:
                        # File has to be seekable, but opt_param_file is not, so load in BytesIO first
                        # fixed in python >= 3.7
                        file_content = io.BytesIO()
                        file_content.write(opt_param_file.read())
                        # go to start of file
                        file_content.seek(0)
                        # load the parameters with the right ``map_location``
                        params[os.path.splitext(file_path)[0]] = th.load(file_content, map_location=device)
    except zipfile.BadZipFile:
        # load_path wasn't a zip file
        raise ValueError(f"Error: the file {load_path} wasn't a zip-file")
    return data, params, tensors
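# Hedged usage sketch: unpacking the three return values (class parameters,
# per-object state_dicts, extra tensors) of the function above;
# "/tmp/model.zip" is a placeholder path.
data, params, tensors = _load_from_file("/tmp/model.zip")
policy_state_dict = params.get("policy")  # state_dict saved under "policy.pth"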
                    type=str, default="BreakoutNoFrameskip-v4", help='environment ID')
parser.add_argument('-e', '--epochs', help='Number of epochs to train for', default=5, type=int)
parser.add_argument('-s', '--seed', help="Random seed", default=0, type=int)
args = parser.parse_args()

device = get_device()
print(f"Using {device} device.")

seed = args.seed
random.seed(seed)
np.random.seed(seed)
th.manual_seed(seed)
th.backends.cudnn.deterministic = True
th.backends.cudnn.benchmark = False
set_random_seed(seed)

env_id = args.env
if 'maze' not in env_id.lower():
    raise Exception(f"env {env_id} is not a maze env")
env = gym.make(env_id)
print(f"Created env with obs.shape = {env.reset().shape}.")
def __init__(
    self,
    policy: Type[BasePolicy],
    env: Union[GymEnv, str, None],
    policy_base: Type[BasePolicy],
    learning_rate: Union[float, Schedule],
    policy_kwargs: Optional[Dict[str, Any]] = None,
    tensorboard_log: Optional[str] = None,
    verbose: int = 0,
    device: Union[th.device, str] = "auto",
    support_multi_env: bool = False,
    create_eval_env: bool = False,
    monitor_wrapper: bool = True,
    seed: Optional[int] = None,
    use_sde: bool = False,
    sde_sample_freq: int = -1,
    supported_action_spaces: Optional[Tuple[gym.spaces.Space, ...]] = None,
):
    if isinstance(policy, str) and policy_base is not None:
        self.policy_class = get_policy_from_name(policy_base, policy)
    else:
        self.policy_class = policy

    self.device = get_device(device)
    if verbose > 0:
        print(f"Using {self.device} device")

    self.env = None  # type: Optional[GymEnv]
    # get VecNormalize object if needed
    self._vec_normalize_env = unwrap_vec_normalize(env)
    self.verbose = verbose
    self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
    self.observation_space = None  # type: Optional[gym.spaces.Space]
    self.action_space = None  # type: Optional[gym.spaces.Space]
    self.n_envs = None
    self.num_timesteps = 0
    # Used for updating schedules
    self._total_timesteps = 0
    # Used for computing fps, it is updated at each call of learn()
    self._num_timesteps_at_start = 0
    self.eval_env = None
    self.seed = seed
    self.action_noise = None  # type: Optional[ActionNoise]
    self.start_time = None
    self.policy = None
    self.learning_rate = learning_rate
    self.tensorboard_log = tensorboard_log
    self.lr_schedule = None  # type: Optional[Schedule]
    self._last_obs = None  # type: Optional[Union[np.ndarray, Dict[str, np.ndarray]]]
    self._last_episode_starts = None  # type: Optional[np.ndarray]
    # When using VecNormalize:
    self._last_original_obs = None  # type: Optional[Union[np.ndarray, Dict[str, np.ndarray]]]
    self._episode_num = 0
    # Used for gSDE only
    self.use_sde = use_sde
    self.sde_sample_freq = sde_sample_freq
    # Track the training progress remaining (from 1 to 0)
    # this is used to update the learning rate
    self._current_progress_remaining = 1
    # Buffers for logging
    self.ep_info_buffer = None  # type: Optional[deque]
    self.ep_success_buffer = None  # type: Optional[deque]
    # For logging (and TD3 delayed updates)
    self._n_updates = 0  # type: int
    # The logger object
    self._logger = None  # type: Logger
    # Whether the user passed a custom logger or not
    self._custom_logger = False

    # Create and wrap the env if needed
    if env is not None:
        if isinstance(env, str):
            if create_eval_env:
                self.eval_env = maybe_make_env(env, self.verbose)
        env = maybe_make_env(env, self.verbose)
        env = self._wrap_env(env, self.verbose, monitor_wrapper)

        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.n_envs = env.num_envs
        self.env = env

        if supported_action_spaces is not None:
            assert isinstance(self.action_space, supported_action_spaces), (
                f"The algorithm only supports {supported_action_spaces} as action spaces "
                f"but {self.action_space} was provided"
            )

        if not support_multi_env and self.n_envs > 1:
            raise ValueError(
                "Error: the model does not support multiple envs; it requires "
                "a single vectorized environment."
            )

        # Catch common mistake: using MlpPolicy/CnnPolicy instead of MultiInputPolicy
        if policy in ["MlpPolicy", "CnnPolicy"] and isinstance(self.observation_space, gym.spaces.Dict):
            raise ValueError(
                f"You must use `MultiInputPolicy` when working with dict observation space, not {policy}"
            )

        if self.use_sde and not isinstance(self.action_space, gym.spaces.Box):
            raise ValueError(
                "generalized State-Dependent Exploration (gSDE) can only be used with continuous actions."
            )
def __init__(self,
             net_arch: Dict[str, List[Tuple[nn.Module, int]]],
             activation_fn: Type[nn.Module],
             gnn_for_values=False,
             use_sibling_relations: bool = False,
             drop_body_nodes: bool = True,
             embedding_option=EmbeddingOption.SHARED,
             device: Union[torch.device, str] = "auto",
             task_name: str = None,
             xml_name: str = None,
             xml_assets_path: Path = None):
    '''
    TODO add documentation

    Parameters:
        net_arch:
            Specifies the network architecture. The network consists of four
            parts. First we have two parts that make up the shared network,
            which takes as input the observations mapped to the node
            embedding space. The mapping is done based on the group a node
            belongs to (hips, feet, ankles, etc.). Because this mapping
            results in differently sized node features (e.g. ankles may have
            more node features than feet), the first part of the shared
            network is the input model, which produces a fixed-size node
            embedding vector for all nodes regardless of their group.
            The second part of the shared network is a GNN, called the
            propagation model. It takes the fixed-size embedding vectors and
            the adjacency matrix and outputs the new node embeddings.
            Afterwards we have two separate networks, the value model and the
            policy model. Both take the new node embeddings and output a
            latent representation for the policy mean or the value scalar.

            The network architecture is provided as a dictionary of lists
            with four keys corresponding to the four parts of the network
            described above. Each list contains tuples of type
            (nn.Module, int), where the first element is the layer class to
            use and the second element is the output size of this layer.
            For example:

                net_arch = {
                    "input": [(nn.Linear, 8)],
                    "propagate": [(GCNConv, 12), (nn.Linear, 16), (GCNConv, 12)],
                    "policy": [(nn.Linear, 16)],
                    "value": [(nn.Linear, 16)],
                }
    '''
    super(NerveNetGNN_V0, self).__init__()

    self.task_name = task_name
    self.xml_name = xml_name
    self.xml_assets_path = xml_assets_path
    self.device = get_device(device)
    self.gnn_for_values = gnn_for_values

    self.info = parse_mujoco_graph(task_name=self.task_name,
                                   xml_name=self.xml_name,
                                   xml_assets_path=self.xml_assets_path,
                                   embedding_option=embedding_option)
    self.info["static_input_mapping"] = {}

    # Notes on edge attributes:
    # Using one-hot encoding leads to num_edge_features != 1.
    # Officially this is supported for graph data types. However, depending
    # on the type of GNN used, the edge attributes are interpreted not as
    # attributes but as weights. Hence, they can't be of arbitrary shape
    # (must be [num_edges, 1]) and should be somewhat meaningfully
    # interpretable as weight factors. This is not the case for attributes
    # produced by the following function, so we should not use them!
    self.edge_index, self.edge_attr = relation_matrix_to_adjacency_matrix(
        self.info["relation_matrix"],
        self_loop=True
    )
    self.edge_index = self.edge_index.to(self.device)
    self.edge_attr = self.edge_attr.to(self.device)

    self.static_node_attr, self.static_node_attr_mask = get_static_node_attributes(
        self.info["static_input_mapping"],
        self.info["num_nodes"])

    self.update_masks, self.observation_mask = get_update_masks(
        self.info["obs_input_mapping"],
        self.static_node_attr_mask,
        self.static_node_attr.shape,
        self.info["input_type_dict"])

    self.shared_input_nets = {}
    shared_net, policy_net, value_net = [], [], []
    # Layer sizes of the network that only belongs to the policy network
    policy_only_layers = []
    # Layer sizes of the network that only belongs to the value network
    value_only_layers = []

    assert "input" in net_arch, "An input model must be specified in the net_arch attribute"
    assert "propagate" in net_arch, "A propagation model must be specified in the net_arch attribute"
    assert "policy" in net_arch, "A policy model must be specified in the net_arch attribute"
    assert "value" in net_arch, "A value model must be specified in the net_arch attribute"

    # From here on we build the network.
    # First we build the input model, where each group of nodes gets its own
    # instance of the input model.
    for group_name, (_, attribute_mask) in self.update_masks.items():
        shared_input_layers = []
        last_layer_dim_input = len(attribute_mask)
        if last_layer_dim_input > 0:
            for layer_class, layer_size in net_arch["input"]:
                shared_input_layers.append(layer_class(last_layer_dim_input, layer_size))
                shared_input_layers.append(activation_fn())
                last_layer_dim_input = layer_size
        else:
            shared_input_layers.append(nn.Identity())
            last_layer_dim_input = net_arch["input"][-1][1]
        self.shared_input_nets[group_name] = nn.Sequential(*shared_input_layers).to(self.device)

    # max_static_feature_dim = self.static_node_attr.shape[1]
    # max_obs_feature_dim = max(
    #     [len(l) for l in self.info["obs_input_mapping"].values()])
    # last_layer_dim_input = max_obs_feature_dim + max_static_feature_dim
    self.last_layer_dim_input = last_layer_dim_input
    last_layer_dim_shared = last_layer_dim_input

    # Iterate through the shared layers and build the shared parts of the
    # network; only the shared network may have GCN convolutions.
    for layer_class, layer_size in net_arch["propagate"]:
        # TODO: give layer a meaningful name
        if layer_class == GCNConv:
            # for GCNConv we need an additional parameter for the constructor
            shared_net.append(layer_class(last_layer_dim_shared, layer_size,
                                          # we already added self-loops ourselves
                                          add_self_loops=False).to(self.device))
        elif layer_class == NerveNetConv:
            shared_net.append(layer_class(last_layer_dim_shared, layer_size,
                                          self.update_masks,
                                          device=device).to(self.device))
        else:
            shared_net.append(layer_class(last_layer_dim_shared, layer_size).to(self.device))
        shared_net.append(activation_fn())
        last_layer_dim_shared = layer_size

    # Build the non-shared part of the network.
    # In the shared network we use GCN convolutions, which means
    # last_layer_dim_shared is the number of dimensions we have for every
    # single node.
    last_layer_dim_pi = self.info["num_nodes"] * last_layer_dim_shared
    if self.gnn_for_values:
        last_layer_dim_vf = self.info["num_nodes"] * last_layer_dim_shared
    else:
        last_layer_dim_vf = self.info["num_nodes"] * self.last_layer_dim_input

    for layer_class, layer_size in net_arch["policy"]:
        policy_net.append(layer_class(last_layer_dim_pi, layer_size).to(self.device))
        policy_net.append(activation_fn().to(self.device))
        last_layer_dim_pi = layer_size
    policy_net.append(nn.Linear(last_layer_dim_pi, len(self.info["output_list"])).to(self.device))

    for layer_class, layer_size in net_arch["value"]:
        value_net.append(layer_class(last_layer_dim_vf, layer_size).to(self.device))
        value_net.append(activation_fn().to(self.device))
        last_layer_dim_vf = layer_size
    value_net.append(nn.Linear(last_layer_dim_vf, 1).to(self.device))

    # Save dim, used to create the distributions
    self.latent_dim_pi = last_layer_dim_pi
    self.latent_dim_vf = last_layer_dim_vf

    # Create networks.
    # If the list of layers is empty, the network will just act as an
    # Identity module.
    self.shared_net = shared_net
    self.flatten = nn.Flatten()
    self.policy_net = nn.Sequential(*policy_net).to(self.device)
    self.value_net = nn.Sequential(*value_net).to(self.device)
    self.debug = nn.Sequential(
        nn.Linear(self.last_layer_dim_input, 64),
        activation_fn(),
        nn.Linear(64, 64),
        activation_fn()
    )
def __init__(
    self,
    observation_space: gym.spaces.Space,
    features_dim: int = 128,
    obs_per_timestep: int = 5,
    num_transformer_units: int = 2,
    attention_dim: int = 32,
    num_heads: int = 2,
    head_dim: int = 16,
    position_wise_mlp_dim: int = 32,
    init_gru_gate_bias: float = 2.0,
    activation_fn: Type[nn.Module] = th.nn.ReLU,
    device: Union[th.device, str] = "auto",
):
    super(TransformerExtractor, self).__init__()

    self.single_obs_dim = observation_space.shape[1]
    self.features_dim = features_dim
    self.input_dim = 64
    self.device = get_device(device)
    self.num_transformer_units = num_transformer_units
    self.attention_dim = attention_dim
    # No trailing commas here: `self.num_heads = num_heads,` would silently
    # turn these attributes into one-element tuples.
    self.num_heads = num_heads
    self.head_dim = head_dim
    self.position_wise_mlp_dim = position_wise_mlp_dim
    self.init_gru_gate_bias = init_gru_gate_bias
    self.obs_per_timestep = obs_per_timestep  # number of obs per timestep

    self.linear_layer = FullyConnected(in_size=self.single_obs_dim,
                                       out_size=self.attention_dim).to(self.device)

    transformer_layers = []
    for i in range(self.num_transformer_units):
        # RelativeMultiHeadAttention part.
        MHA_layer = SkipConnection(
            MultiHeadAttention(in_dim=self.attention_dim,
                               out_dim=self.attention_dim,
                               num_heads=num_heads,
                               head_dim=head_dim,
                               input_layernorm=True,
                               output_activation=nn.ReLU,
                               device=self.device),
            fan_in_layer=GRUGate(self.attention_dim,
                                 init_gru_gate_bias,
                                 device=self.device))

        # Position-wise MultiLayerPerceptron part.
        E_layer = SkipConnection(
            nn.Sequential(
                th.nn.LayerNorm(self.attention_dim),
                FullyConnected(in_size=self.attention_dim,
                               out_size=position_wise_mlp_dim,
                               use_bias=False,
                               activation_fn=nn.ReLU),
                FullyConnected(in_size=position_wise_mlp_dim,
                               out_size=self.attention_dim,
                               use_bias=False,
                               activation_fn=nn.ReLU)),
            fan_in_layer=GRUGate(self.attention_dim,
                                 init_gru_gate_bias,
                                 device=self.device))

        # Build a list of all attention layers in order.
        transformer_layers.extend([MHA_layer, E_layer])

    transformer_layers.append(th.nn.Flatten())
    transformer_layers.append(nn.Linear(self.input_dim * self.attention_dim, self.features_dim))

    self.transformer = nn.Sequential(*transformer_layers).to(self.device)
    print("Swarmformer # trainable parameters",
          sum(p.numel() for p in self.transformer.parameters() if p.requires_grad))
def test_save_load(tmp_path, model_class):
    """
    Test if 'save' and 'load' saves and loads model correctly
    and if 'load_parameters' and 'get_policy_parameters' work correctly.

    Warning: does not test the loading of optimizer parameters.

    :param model_class: (BaseAlgorithm) A RL model
    """
    env = DummyVecEnv([lambda: select_env(model_class)])

    # create model
    model = model_class("MlpPolicy", env, policy_kwargs=dict(net_arch=[16]), verbose=1)
    model.learn(total_timesteps=500, eval_freq=250)

    env.reset()
    observations = np.concatenate([env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    # Get dictionary of current parameters
    params = deepcopy(model.policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())

    # Update model parameters with the new random values
    model.policy.load_state_dict(random_params)

    new_params = model.policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # Check if the model loads as expected for every possible choice of device:
    for device in ["auto", "cpu", "cuda"]:
        model = model_class.load(str(tmp_path / "test_save.zip"), env=env, device=device)

        # check if the model was loaded to the correct device
        assert model.device.type == get_device(device).type
        assert model.policy.device.type == get_device(device).type

        # check if params are still the same after load
        new_params = model.policy.state_dict()

        # Check that all params are the same as before save load procedure now
        for key in params:
            assert new_params[key].device.type == get_device(device).type
            assert th.allclose(
                params[key].to("cpu"), new_params[key].to("cpu")
            ), "Model parameters not the same after save and load."

        # check if model still selects the same actions
        new_selected_actions, _ = model.predict(observations, deterministic=True)
        assert np.allclose(selected_actions, new_selected_actions, 1e-4)

        # check if learn still works
        model.learn(total_timesteps=1000, eval_freq=500)

        del model

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
def load_from_zip_file(
    load_path: Union[str, pathlib.Path, io.BufferedIOBase],
    load_data: bool = True,
    custom_objects: Optional[Dict[str, Any]] = None,
    device: Union[th.device, str] = "auto",
    verbose: int = 0,
    print_system_info: bool = False,
) -> (Tuple[Optional[Dict[str, Any]], Optional[TensorDict], Optional[TensorDict]]):
    """
    Load model data from a .zip archive

    :param load_path: Where to load the model from
    :param load_data: Whether we should load and return data
        (class parameters). Mainly used by 'load_parameters' to only load
        model parameters (weights)
    :param custom_objects: Dictionary of objects to replace upon loading.
        If a variable is present in this dictionary as a key, it will not be
        deserialized and the corresponding item will be used instead. Similar
        to custom_objects in ``keras.models.load_model``. Useful when you
        have an object in file that can not be deserialized.
    :param device: Device on which the code should run.
    :param verbose: Verbosity level, 0 means only warnings, 2 means debug information.
    :param print_system_info: Whether to print or not the system info
        about the saved model.
    :return: Class parameters, model state_dicts (aka "params", dict of
        state_dict) and dict of pytorch variables
    """
    load_path = open_path(load_path, "r", verbose=verbose, suffix="zip")

    # set device to cpu if cuda is not available
    device = get_device(device=device)

    # Open the zip archive and load data
    try:
        with zipfile.ZipFile(load_path) as archive:
            namelist = archive.namelist()
            # If data or parameters is not in the
            # zip archive, assume they were stored
            # as None (_save_to_file_zip allows this).
            data = None
            pytorch_variables = None
            params = {}

            # Debug system info first
            if print_system_info:
                if "system_info.txt" in namelist:
                    print("== SAVED MODEL SYSTEM INFO ==")
                    print(archive.read("system_info.txt").decode())
                else:
                    warnings.warn(
                        "The model was saved with SB3 <= 1.2.0 and thus cannot print system information.",
                        UserWarning,
                    )

            if "data" in namelist and load_data:
                # Load class parameters that are stored
                # with either JSON or pickle (not PyTorch variables).
                json_data = archive.read("data").decode()
                data = json_to_data(json_data, custom_objects=custom_objects)

            # Check for all .pth files and load them using th.load.
            # "pytorch_variables.pth" stores PyTorch variables, and any other .pth
            # files store state_dicts of variables with custom names (e.g. policy, policy.optimizer)
            pth_files = [
                file_name for file_name in namelist
                if os.path.splitext(file_name)[1] == ".pth"
            ]
            for file_path in pth_files:
                with archive.open(file_path, mode="r") as param_file:
                    # File has to be seekable, but param_file is not, so load in BytesIO first
                    # fixed in python >= 3.7
                    file_content = io.BytesIO()
                    file_content.write(param_file.read())
                    # go to start of file
                    file_content.seek(0)
                    # Load the parameters with the right ``map_location``.
                    # Remove ".pth" ending with splitext
                    th_object = th.load(file_content, map_location=device)
                    # "tensors.pth" was renamed "pytorch_variables.pth" in v0.9.0, see PR #138
                    if file_path == "pytorch_variables.pth" or file_path == "tensors.pth":
                        # PyTorch variables (not state_dicts)
                        pytorch_variables = th_object
                    else:
                        # State dicts. Store into params dictionary
                        # with same name as in .zip file (without .pth)
                        params[os.path.splitext(file_path)[0]] = th_object
    except zipfile.BadZipFile:
        # load_path wasn't a zip file
        raise ValueError(f"Error: the file {load_path} wasn't a zip-file")
    return data, params, pytorch_variables
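# Hedged usage sketch: custom_objects mirrors keras.models.load_model — keys
# found in the saved data are replaced instead of being deserialized, which
# helps when e.g. a saved lr_schedule can no longer be unpickled.
# "/tmp/model.zip" is a placeholder path.
data, params, pytorch_variables = load_from_zip_file(
    "/tmp/model.zip",
    custom_objects={"lr_schedule": lambda _: 3e-4},
    device="cpu",
)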
def __init__(
    self,
    observation_space: gym.Space,
    action_space: gym.Space,
    *,
    policy_class: Type[policies.BasePolicy] = base.FeedForward32Policy,
    policy_kwargs: Optional[Mapping[str, Any]] = None,
    expert_data: Union[
        types.TransitionsMinimal,
        datasets.Dataset[types.TransitionsMinimal],
        None,
    ] = None,
    batch_size: int = 32,
    optimizer_cls: Type[th.optim.Optimizer] = th.optim.Adam,
    optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ent_weight: float = 1e-3,
    l2_weight: float = 0.0,
    device: Union[str, th.device] = "auto",
):
    """Behavioral cloning (BC).

    Recovers a policy via supervised learning on a Dataset of
    observation-action pairs.

    Args:
        observation_space: the observation space of the environment.
        action_space: the action space of the environment.
        policy_class: used to instantiate imitation policy.
        policy_kwargs: keyword arguments passed to policy's constructor.
        expert_data: If not None, then immediately call
            `self.set_expert_dataset(expert_data)` during initialization.
        batch_size: batch size used for training.
        optimizer_cls: optimiser to use for supervised training.
        optimizer_kwargs: keyword arguments, excluding learning rate and
            weight decay, for optimiser construction.
        ent_weight: scaling applied to the policy's entropy regularization.
        l2_weight: scaling applied to the policy's L2 regularization.
        device: name/identity of device to place policy on.
    """
    if optimizer_kwargs:
        if "weight_decay" in optimizer_kwargs:
            raise ValueError("Use the parameter l2_weight instead of weight_decay.")

    self.action_space = action_space
    self.observation_space = observation_space
    self.policy_class = policy_class
    self.device = utils.get_device(device)
    self.policy_kwargs = dict(
        observation_space=self.observation_space,
        action_space=self.action_space,
        lr_schedule=ConstantLRSchedule(),
        device=self.device,
    )
    self.policy_kwargs.update(policy_kwargs or {})

    self.policy = self.policy_class(**self.policy_kwargs).to(
        self.device
    )  # pytype: disable=not-instantiable
    optimizer_kwargs = optimizer_kwargs or {}
    self.optimizer = optimizer_cls(self.policy.parameters(), **optimizer_kwargs)

    assert batch_size >= 1
    self.batch_size = batch_size
    self.expert_dataset: Optional[datasets.Dataset[types.TransitionsMinimal]] = None
    self.ent_weight = ent_weight
    self.l2_weight = l2_weight

    if expert_data is not None:
        self.set_expert_dataset(expert_data)
def __init__(
    self,
    attacker_policy: Type[BasePolicy],
    defender_policy: Type[BasePolicy],
    env: Union[GymEnv, str, None],
    policy_base: Type[BasePolicy],
    attacker_learning_rate: Union[float, Schedule],
    defender_learning_rate: Union[float, Schedule],
    attacker_policy_kwargs: Dict[str, Any] = None,
    defender_policy_kwargs: Dict[str, Any] = None,
    tensorboard_log: Optional[str] = None,
    device: Union[th.device, str] = "auto",
    seed: Optional[int] = None,
    train_mode: TrainMode = TrainMode.TRAIN_ATTACKER,
    attacker_agent_config: AgentConfig = None,
    defender_agent_config: AgentConfig = None
):
    self.attacker_agent_config = attacker_agent_config
    self.defender_agent_config = defender_agent_config
    try:
        self.tensorboard_writer = SummaryWriter(self.attacker_agent_config.tensorboard_dir)
        self.tensorboard_writer.add_hparams(self.attacker_agent_config.hparams_dict(), {})
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        print("error creating tensorboard writer")

    if isinstance(attacker_policy, str) and policy_base is not None:
        self.attacker_policy_class = get_policy_from_name(policy_base, attacker_policy)
    else:
        self.attacker_policy_class = attacker_policy
    if isinstance(defender_policy, str) and policy_base is not None:
        self.defender_policy_class = get_policy_from_name(policy_base, defender_policy)
    else:
        self.defender_policy_class = defender_policy

    self.device = get_device(device)
    self.env = None
    self._vec_normalize_env = unwrap_vec_normalize(env)
    self.attacker_policy_kwargs = {} if attacker_policy_kwargs is None else attacker_policy_kwargs
    self.defender_policy_kwargs = {} if defender_policy_kwargs is None else defender_policy_kwargs
    self.attacker_observation_space = None
    self.attacker_action_space = None
    self.defender_observation_space = None
    self.defender_action_space = None
    self.n_envs = None
    self.num_timesteps = 0
    # Used for updating schedules
    self._total_timesteps = 0
    self.seed = seed
    self.start_time = None
    self.attacker_policy = None
    self.defender_policy = None
    self.attacker_learning_rate = attacker_learning_rate
    self.defender_learning_rate = defender_learning_rate
    self.tensorboard_log = tensorboard_log
    self._last_obs = None
    self._last_episode_starts = None
    self._last_original_obs = None
    self._last_dones = None
    self._episode_num = 0
    self._current_progress_remaining = 1
    self.ep_info_buffer = None
    self.ep_success_buffer = None
    self._n_updates = 0
    self.train_mode = train_mode
    self.train_result = ExperimentResult()
    self.eval_result = ExperimentResult()
    self.training_start = time.time()

    # Create and wrap the env if needed
    if env is not None:
        env = maybe_make_env(env)
        env = self._wrap_env(env)
        self.attacker_observation_space = env.attacker_observation_space
        self.attacker_action_space = env.attacker_action_space
        self.defender_observation_space = env.defender_observation_space
        self.defender_action_space = env.defender_action_space
        self.n_envs = env.num_envs
        self.env = env
def test_save_load(tmp_path, model_class):
    """
    Test if 'save' and 'load' saves and loads model correctly
    and if 'get_parameters' and 'set_parameters' work correctly.

    Warning: does not test the loading of optimizer parameters.

    :param model_class: (BaseAlgorithm) A RL model
    """
    env = DummyVecEnv([lambda: select_env(model_class)])

    # create model
    model = model_class("MlpPolicy", env, policy_kwargs=dict(net_arch=[16]), verbose=1)
    model.learn(total_timesteps=500)

    env.reset()
    observations = np.concatenate([env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)

    # Get parameters of different objects
    # deepcopy to avoid referencing to tensors we are about to modify
    original_params = deepcopy(model.get_parameters())

    # Test different error cases of set_parameters.
    # Test that invalid object names throw errors.
    invalid_object_params = deepcopy(original_params)
    invalid_object_params["I_should_not_be_a_valid_object"] = "and_I_am_an_invalid_tensor"
    with pytest.raises(ValueError):
        model.set_parameters(invalid_object_params, exact_match=True)
    with pytest.raises(ValueError):
        model.set_parameters(invalid_object_params, exact_match=False)

    # Test that exact_match catches when something was missed.
    missing_object_params = dict((k, v) for k, v in list(original_params.items())[:-1])
    with pytest.raises(ValueError):
        model.set_parameters(missing_object_params, exact_match=True)

    # Test that exact_match catches when something inside state-dict
    # is missing but we have exact_match.
    missing_state_dict_tensor_params = {}
    for object_name in original_params:
        object_params = {}
        missing_state_dict_tensor_params[object_name] = object_params
        # Skip last item in state-dict
        for k, v in list(original_params[object_name].items())[:-1]:
            object_params[k] = v
    with pytest.raises(RuntimeError):
        # PyTorch load_state_dict throws RuntimeError if strict but
        # invalid state-dict.
        model.set_parameters(missing_state_dict_tensor_params, exact_match=True)

    # Test that parameters do indeed change.
    random_params = {}
    for object_name, params in original_params.items():
        # Do not randomize optimizer parameters (custom layout)
        if "optim" in object_name:
            random_params[object_name] = params
        else:
            # Again, skip the last item in state-dict
            random_params[object_name] = OrderedDict(
                (param_name, th.rand_like(param))
                for param_name, param in list(params.items())[:-1]
            )

    # Update model parameters with the new random values
    model.set_parameters(random_params, exact_match=False)

    new_params = model.get_parameters()
    # Check that all params except the final item in each state-dict are different.
    for object_name in original_params:
        # Skip optimizers (no valid comparison with just th.allclose)
        if "optim" in object_name:
            continue
        # state-dicts use ordered dictionaries, so key order is guaranteed.
        last_key = list(original_params[object_name].keys())[-1]
        for k in original_params[object_name]:
            if k == last_key:
                # Should be same as before
                assert th.allclose(
                    original_params[object_name][k], new_params[object_name][k]
                ), "Parameter changed despite not included in the loaded parameters."
            else:
                # Should be different
                assert not th.allclose(
                    original_params[object_name][k], new_params[object_name][k]
                ), "Parameters did not change as expected."

    params = new_params

    # get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Check
    model.save(tmp_path / "test_save.zip")
    del model

    # Check if the model loads as expected for every possible choice of device:
    for device in ["auto", "cpu", "cuda"]:
        model = model_class.load(str(tmp_path / "test_save.zip"), env=env, device=device)

        # check if the model was loaded to the correct device
        assert model.device.type == get_device(device).type
        assert model.policy.device.type == get_device(device).type

        # check if params are still the same after load
        new_params = model.get_parameters()

        # Check that all params are the same as before save load procedure now
        for object_name in new_params:
            # Skip optimizers (no valid comparison with just th.allclose)
            if "optim" in object_name:
                continue
            for key in params[object_name]:
                assert new_params[object_name][key].device.type == get_device(device).type
                assert th.allclose(
                    params[object_name][key].to("cpu"), new_params[object_name][key].to("cpu")
                ), "Model parameters not the same after save and load."

        # check if model still selects the same actions
        new_selected_actions, _ = model.predict(observations, deterministic=True)
        assert np.allclose(selected_actions, new_selected_actions, 1e-4)

        # check if learn still works
        model.learn(total_timesteps=500)

        del model

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
def load_from_zip_file(
    load_path: Union[str, pathlib.Path, io.BufferedIOBase],
    load_data: bool = True,
    device: Union[th.device, str] = "auto",
    verbose: int = 0,
) -> (Tuple[Optional[Dict[str, Any]], Optional[TensorDict], Optional[TensorDict]]):
    """
    Load model data from a .zip archive

    :param load_path: Where to load the model from
    :param load_data: Whether we should load and return data
        (class parameters). Mainly used by 'load_parameters' to only load
        model parameters (weights)
    :param device: Device on which the code should run.
    :return: Class parameters, model state_dicts (aka "params", dict of
        state_dict) and dict of pytorch variables
    """
    load_path = open_path(load_path, "r", verbose=verbose, suffix="zip")

    # set device to cpu if cuda is not available
    device = get_device(device=device)

    # Open the zip archive and load data
    try:
        with zipfile.ZipFile(load_path) as archive:
            namelist = archive.namelist()
            # If data or parameters is not in the
            # zip archive, assume they were stored
            # as None (_save_to_file_zip allows this).
            data = None
            pytorch_variables = None
            params = {}

            if "data" in namelist and load_data:
                # Load class parameters that are stored
                # with either JSON or pickle (not PyTorch variables).
                json_data = archive.read("data").decode()
                data = json_to_data(json_data)

            # Check for all .pth files and load them using th.load.
            # "pytorch_variables.pth" stores PyTorch variables, and any other .pth
            # files store state_dicts of variables with custom names (e.g. policy, policy.optimizer)
            pth_files = [
                file_name for file_name in namelist
                if os.path.splitext(file_name)[1] == ".pth"
            ]
            for file_path in pth_files:
                with archive.open(file_path, mode="r") as param_file:
                    # File has to be seekable, but param_file is not, so load in BytesIO first
                    # fixed in python >= 3.7
                    file_content = io.BytesIO()
                    file_content.write(param_file.read())
                    # go to start of file
                    file_content.seek(0)
                    # Load the parameters with the right ``map_location``.
                    # Remove ".pth" ending with splitext
                    th_object = th.load(file_content, map_location=device)
                    # "tensors.pth" was renamed "pytorch_variables.pth" in v0.9.0, see PR #138
                    if file_path == "pytorch_variables.pth" or file_path == "tensors.pth":
                        # PyTorch variables (not state_dicts)
                        pytorch_variables = th_object
                    else:
                        # State dicts. Store into params dictionary
                        # with same name as in .zip file (without .pth)
                        params[os.path.splitext(file_path)[0]] = th_object
    except zipfile.BadZipFile:
        # load_path wasn't a zip file
        raise ValueError(f"Error: the file {load_path} wasn't a zip-file")
    return data, params, pytorch_variables