def __init__(self, observation_space, action_space, config): """ Example of a config = { 'actions': {0, 1, 2}, 'alpha': 0.1, 'epsilon': 0.1, 'gamma': 0.6, 'seed': 42, 'init': 0.0, } """ Policy.__init__(self, observation_space, action_space, config) # Parameters self.set_of_actions = deepcopy(config['actions']) self.alpha = deepcopy(config['alpha']) self.gamma = deepcopy(config['gamma']) self.epsilon = deepcopy(config['epsilon']) self.qtable = QTable(self.set_of_actions, default=config['init'], seed=config['seed']) self.qtable_state_action_counter = QTable(self.set_of_actions, default=0) self.qtable_state_action_reward = QTable(self.set_of_actions, default=list()) # self.qtable_new_state_action_total_reward = QTable(self.set_of_actions, default=list()) self.rndgen = RandomState(config['seed']) # Logging self.stats = dict() self._reset_stats_values()
def build_q_models(policy: Policy, obs_space: gym.Space,
                   action_space: gym.Space,
                   config: TrainerConfigDict) -> ModelV2:
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_SCOPE)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
def load_metanash_pure_strat(policy: Policy, pure_strat_spec: StrategySpec):
    pure_strat_checkpoint_path = pure_strat_spec.metadata["checkpoint_path"]
    checkpoint_data = deepdish.io.load(path=pure_strat_checkpoint_path)
    weights = checkpoint_data["weights"]
    weights = {k.replace("_dot_", "."): v for k, v in weights.items()}
    policy.set_weights(weights=weights)
    policy.p2sro_policy_spec = pure_strat_spec
def __init__(self, observation_space, action_space, config, dqn_config):
    Policy.__init__(self, observation_space, action_space, config)
    self.device = torch.device(f"cuda:{dqn_config['cuda_id']}"
                               if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.num_states = int(np.product(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    # self.eval_net = DQNModule(self.num_states, self.num_actions).to(self.device)
    # self.target_net = DQNModule(self.num_states, self.num_actions).to(self.device)
    self.eval_net = DQNActionModule(self.device, self.num_states,
                                    self.num_actions).to(self.device)
    self.target_net = DQNActionModule(self.device, self.num_states,
                                      self.num_actions).to(self.device)
    self.target_net.load_state_dict(self.eval_net.state_dict())
    self.learn_step_counter = 0
    self.memory = replay_memory(dqn_config['replay_capacity'], num_result=6)
    self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                      lr=dqn_config['lr'])
    # self.loss_func = nn.SmoothL1Loss()
    self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0
def spl_torch_loss(
        policy: Policy, model: ModelV2,
        dist_class: Type[TorchDistributionWrapper],
        train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]:
    """The supervised learning (SPL) loss function.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distr. class.
        train_batch (SampleBatch): The training data.

    Returns:
        Union[TensorType, List[TensorType]]: A single loss tensor or a list
            of loss tensors.
    """
    # Pass the training data through our model to get distribution parameters.
    dist_inputs, _ = model.from_batch(train_batch)

    # Create an action distribution object.
    predictions = dist_class(dist_inputs, model)

    # Build the supervised targets (actions and/or rewards).
    targets = []
    if policy.config["learn_action"]:
        targets.append(train_batch[SampleBatch.ACTIONS])
    if policy.config["learn_reward"]:
        targets.append(train_batch[SampleBatch.REWARDS])
    assert len(targets) > 0
    targets = torch.cat(targets, dim=0)

    # Save the loss in the policy object for the spl_stats below.
    policy.spl_loss = policy.config["loss_fn"](predictions.dist.probs, targets)
    policy.entropy = predictions.dist.entropy().mean()

    return policy.spl_loss
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.observation_space = observation_space
    self.action_space = action_space
    self.config = config
    self.action_shape = action_space.n

    # GPU settings
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")

    # This attribute will be incremented every time learn_on_batch is called.
    self.iteration = 0
    # The current time step.
    self.current_step = 0

    # Agent parameters.
    self.lr = self.config["lr"]
    self.gamma = self.config["gamma"]
    self.target_update_frequency = self.config["target_update_frequency"]

    # Strategy
    self.strategy = EpsilonGreedyStrategy(self.config["eps_start"],
                                          self.config["eps_end"],
                                          self.config["eps_decay"])

    # Replay memory
    self.memory = ReplayMemory(self.config["replay_memory_size"])

    # Policy network
    self.policy_net = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=4,
        name="DQNModel",
        model_config=self.config["dqn_model"],
        framework="torch",
    ).to(self.device, non_blocking=True)

    # Target network
    self.target_net = ModelCatalog.get_model_v2(
        obs_space=self.observation_space,
        action_space=self.action_space,
        num_outputs=4,
        name="DQNModel",
        model_config=self.config["dqn_model"],
        framework="torch",
    ).to(self.device, non_blocking=True)

    # Set the weights & biases in the target_net to be the same as those in the policy_net.
    self.target_net.load_state_dict(self.policy_net.state_dict())
    # Put target_net in eval mode. This network will only be used for inference.
    self.target_net.eval()

    # Optimizer.
    self.optimizer = optim.RMSprop(self.policy_net.parameters())

    # The calculated loss.
    self.loss = 0
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    # You can replace this with whatever variable you want to save
    # the state of the policy in. `get_weights` and `set_weights`
    # are used for checkpointing the states and restoring the states
    # from a checkpoint.
    self.w = []
def build_q_losses(
    policy: Policy,
    model: ModelV2,
    dist_class: Type[TFActionDistribution],
    train_batch: SampleBatch,
) -> TensorType:
    """Constructs the loss for SimpleQTFPolicy.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distribution class.
        train_batch (SampleBatch): The training data.

    Returns:
        TensorType: A single loss tensor.
    """
    # q network evaluation
    q_t = compute_q_values(policy, policy.model,
                           train_batch[SampleBatch.CUR_OBS], explore=False)

    # target q network evaluation
    q_tp1 = compute_q_values(policy, policy.target_model,
                             train_batch[SampleBatch.NEXT_OBS], explore=False)

    if not hasattr(policy, "q_func_vars"):
        policy.q_func_vars = model.variables()
        policy.target_q_func_vars = policy.target_model.variables()

    # q scores for actions which we know were selected in the given state.
    one_hot_selection = tf.one_hot(
        tf.cast(train_batch[SampleBatch.ACTIONS], tf.int32),
        policy.action_space.n)
    q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)

    # compute estimate of best possible value starting from state at t + 1
    dones = tf.cast(train_batch[SampleBatch.DONES], tf.float32)
    q_tp1_best_one_hot_selection = tf.one_hot(
        tf.argmax(q_tp1, 1), policy.action_space.n)
    q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
    q_tp1_best_masked = (1.0 - dones) * q_tp1_best

    # compute RHS of bellman equation
    q_t_selected_target = (train_batch[SampleBatch.REWARDS] +
                           policy.config["gamma"] * q_tp1_best_masked)

    # compute the error (potentially clipped)
    td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
    loss = tf.reduce_mean(huber_loss(td_error))

    # save TD error as an attribute for outside access
    policy.td_error = td_error

    return loss
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.method = Method()
    self.episode_length = episode_length = config['rollout_fragment_length']
    self.n_envs = n_envs = config['num_envs_per_worker']
    MAX_BUFFER_SIZE = 1000
    self.total_envs = total_envs = (config['num_workers'] *
                                    config['num_envs_per_worker'])
    self.buffer = TrajBuffer(episode_length, total_envs, MAX_BUFFER_SIZE)
def build_q_losses(policy: Policy, model, dist_class,
                   train_batch: SampleBatch) -> TensorType:
    """Constructs the loss for SimpleQTorchPolicy.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distribution class.
        train_batch (SampleBatch): The training data.

    Returns:
        TensorType: A single loss tensor.
    """
    # q network evaluation
    q_t = compute_q_values(policy, policy.q_model,
                           train_batch[SampleBatch.CUR_OBS],
                           explore=False, is_training=True)

    # target q network evaluation
    q_tp1 = compute_q_values(policy, policy.target_q_model,
                             train_batch[SampleBatch.NEXT_OBS],
                             explore=False, is_training=True)

    # q scores for actions which we know were selected in the given state.
    one_hot_selection = F.one_hot(train_batch[SampleBatch.ACTIONS].long(),
                                  policy.action_space.n)
    q_t_selected = torch.sum(q_t * one_hot_selection, 1)

    # compute estimate of best possible value starting from state at t + 1
    dones = train_batch[SampleBatch.DONES].float()
    q_tp1_best_one_hot_selection = F.one_hot(torch.argmax(q_tp1, 1),
                                             policy.action_space.n)
    q_tp1_best = torch.sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
    q_tp1_best_masked = (1.0 - dones) * q_tp1_best

    # compute RHS of bellman equation
    q_t_selected_target = (train_batch[SampleBatch.REWARDS] +
                           policy.config["gamma"] * q_tp1_best_masked)

    # Compute the error (Square/Huber).
    td_error = q_t_selected - q_t_selected_target.detach()
    # loss = torch.mean(huber_loss(td_error))  # NFSP on Kuhn/Leduc poker fails with huber_loss
    loss = F.mse_loss(input=q_t_selected, target=q_t_selected_target.detach())

    # save TD error as an attribute for outside access
    policy.td_error = td_error
    policy.loss = loss
    return loss
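# --- Illustrative sketch (not part of the loss above) ---
# A toy, self-contained check of the Bellman target computed in build_q_losses:
# target = r + gamma * (1 - done) * max_a' Q_target(s', a'). Values are made up.
import torch

rewards = torch.tensor([1.0, 0.0, 0.5])
dones = torch.tensor([0.0, 1.0, 0.0])        # 1.0 marks a terminal transition
q_tp1 = torch.tensor([[0.2, 0.8],            # target-net Q-values for s'
                      [0.5, 0.1],
                      [0.3, 0.4]])
gamma = 0.99

q_tp1_best = q_tp1.max(dim=1).values                      # max_a' Q_target(s', a')
q_t_selected_target = rewards + gamma * (1.0 - dones) * q_tp1_best
# -> tensor([1.7920, 0.0000, 0.8960]); terminal transitions do not bootstrap.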
def load_pure_strat(policy: Policy, pure_strat_spec, checkpoint_path: str = None):
    assert pure_strat_spec is None or checkpoint_path is None, \
        "can only pass one or the other"
    if checkpoint_path is None:
        if hasattr(policy, "policy_spec") and pure_strat_spec == policy.policy_spec:
            return
        pure_strat_checkpoint_path = pure_strat_spec.metadata["checkpoint_path"]
    else:
        pure_strat_checkpoint_path = checkpoint_path
    checkpoint_data = deepdish.io.load(path=pure_strat_checkpoint_path)
    weights = checkpoint_data["weights"]
    weights = {k.replace("_dot_", "."): v for k, v in weights.items()}
    policy.set_weights(weights=weights)
    policy.policy_spec = pure_strat_spec
def load_pure_strat_cached(policy: Policy, pure_strat_spec):
    pure_strat_checkpoint_path = pure_strat_spec.metadata["checkpoint_path"]
    if pure_strat_checkpoint_path in cache:
        weights = cache[pure_strat_checkpoint_path]
    else:
        checkpoint_data = deepdish.io.load(path=pure_strat_checkpoint_path)
        weights = checkpoint_data["weights"]
        weights = {k.replace("_dot_", "."): v for k, v in weights.items()}
        cache[pure_strat_checkpoint_path] = weights
    policy.set_weights(weights=weights)
    policy.policy_spec = pure_strat_spec
def pg_tf_loss(
        policy: Policy, model: ModelV2, dist_class: Type[ActionDistribution],
        train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]:
    """The basic policy gradients loss function.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distr. class.
        train_batch (SampleBatch): The training data.

    Returns:
        Union[TensorType, List[TensorType]]: A single loss tensor or a list
            of loss tensors.
    """
    # Pass the training data through our model to get distribution parameters.
    dist_inputs, _ = model(train_batch)

    # Create an action distribution object.
    action_dist = dist_class(dist_inputs, model)

    # Calculate the vanilla PG loss based on:
    # L = -E[ log(pi(a|s)) * A ]
    loss = -tf.reduce_mean(
        action_dist.logp(train_batch[SampleBatch.ACTIONS]) *
        tf.cast(train_batch[Postprocessing.ADVANTAGES], dtype=tf.float32))

    policy.policy_loss = loss

    return loss
def pg_torch_loss(
        policy: Policy, model: ModelV2,
        dist_class: Type[TorchDistributionWrapper],
        train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]:
    """The basic policy gradients loss function.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distr. class.
        train_batch (SampleBatch): The training data.

    Returns:
        Union[TensorType, List[TensorType]]: A single loss tensor or a list
            of loss tensors.
    """
    # Pass the training data through our model to get distribution parameters.
    dist_inputs, _ = model.from_batch(train_batch)

    # Create an action distribution object.
    action_dist = dist_class(dist_inputs, model)

    # Calculate the vanilla PG loss based on:
    # L = -E[ log(pi(a|s)) * A ]
    log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS])

    # Save the loss in the policy object for the stats_fn below.
    policy.pi_err = -torch.mean(
        log_probs * train_batch[Postprocessing.ADVANTAGES])

    return policy.pi_err
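# --- Illustrative sketch (not part of the loss above) ---
# A tiny numeric check of the vanilla PG loss L = -E[ log(pi(a|s)) * A ] used
# by pg_tf_loss/pg_torch_loss above. Values are made up.
import torch

log_probs = torch.tensor([-0.2, -1.5])       # log pi(a|s) for two sampled actions
advantages = torch.tensor([2.0, -1.0])       # their advantages
loss = -torch.mean(log_probs * advantages)   # = -((-0.4) + 1.5) / 2 = -0.55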
def get_distribution_inputs_and_class(
        policy: Policy,
        q_model: ModelV2,
        obs_batch: TensorType,
        *,
        explore=True,
        is_training=True,
        **kwargs) -> Tuple[TensorType, type, List[TensorType]]:
    """Build the action distribution"""
    q_vals = compute_q_values(policy, q_model, obs_batch, explore, is_training)
    q_vals = q_vals[0] if isinstance(q_vals, tuple) else q_vals

    policy.q_values = q_vals
    policy.q_func_vars = q_model.variables()
    return policy.q_values, (TorchCategorical if policy.config["framework"] ==
                             "torch" else Categorical), []  # state-outs
def spl_torch_loss(
    policy: Policy,
    model: ModelV2,
    dist_class: Type[TorchDistributionWrapper],
    train_batch: SampleBatch,
) -> Union[TensorType, List[TensorType]]:
    """The supervised learning (SPL) loss function.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distr. class.
        train_batch (SampleBatch): The training data.

    Returns:
        Union[TensorType, List[TensorType]]: A single loss tensor or a list
            of loss tensors.
    """
    # Pass the training data through our model to get distribution parameters.
    dist_inputs, _ = model.from_batch(train_batch)

    # Create an action distribution object.
    action_dist = dist_class(dist_inputs, model)

    if policy.config["explore"]:
        # Added because of a bug in TorchCategorical,
        # which modifies dist_inputs through action_dist:
        _, _ = policy.exploration.get_exploration_action(
            action_distribution=action_dist,
            timestep=policy.global_timestep,
            explore=policy.config["explore"],
        )
        action_dist = dist_class(dist_inputs, policy.model)

    # Build the supervised targets (actions and/or rewards).
    targets = []
    if policy.config["learn_action"]:
        targets.append(train_batch[SampleBatch.ACTIONS])
    if policy.config["learn_reward"]:
        targets.append(train_batch[SampleBatch.REWARDS])
    assert len(targets) > 0, ("In config, use learn_action=True and/or "
                              "learn_reward=True to specify which target to "
                              "use in supervised learning")
    targets = torch.cat(targets, dim=0)

    # Save the loss in the policy object for the spl_stats below.
    policy.spl_loss = policy.config["loss_fn"](action_dist.dist.probs, targets)
    policy.entropy = action_dist.dist.entropy().mean()

    return policy.spl_loss
def _get_log_from_policy(policy: Policy, policy_id: PolicyID) -> dict:
    """Gets the `to_log` dict from a policy and renames its keys,
    appending the policy_id as a suffix."""
    to_log = {}
    if hasattr(policy, "to_log"):
        for k, v in policy.to_log.items():
            to_log[f"{k}/{policy_id}"] = v
        # Clear the policy's log buffer once its entries have been copied.
        policy.to_log = {}
    return to_log
def compute_action(policy: Policy, input_dict: Dict[str, np.ndarray],
                   explore: bool) -> Any:
    """Compute the action predicted by the policy.

    .. note::
        It supports both PyTorch and TensorFlow backends (both eager and
        compiled graph modes).

    :param policy: `rllib.policy.Policy` to use to predict the action, which
                   is a thin wrapper around the actual policy model.
    :param input_dict: Input dictionary used as the forward input of the policy.
    :param explore: Whether or not to enable exploration during sampling of
                    the action.
    """
    if policy.framework == 'torch':
        with torch.no_grad():
            input_dict = policy._lazy_tensor_dict(input_dict)
            action_logits, _ = policy.model(input_dict)
            action_dist = policy.dist_class(action_logits, policy.model)
            if explore:
                action_torch = action_dist.sample()
            else:
                action_torch = action_dist.deterministic_sample()
            action = action_torch.cpu().numpy()
    elif tf.compat.v1.executing_eagerly():
        action_logits, _ = policy.model(input_dict)
        action_dist = policy.dist_class(action_logits, policy.model)
        if explore:
            action_tf = action_dist.sample()
        else:
            action_tf = action_dist.deterministic_sample()
        action = action_tf.numpy()
    else:
        # This obscure piece of code takes advantage of already existing
        # placeholders to avoid creating new nodes to evaluate the computation
        # graph. It is several orders of magnitude more efficient than calling
        # `action_logits, _ = model(input_dict).eval(session=policy._sess)`.
        feed_dict = {policy._input_dict[key]: value
                     for key, value in input_dict.items()
                     if key in policy._input_dict.keys()}
        feed_dict[policy._is_exploring] = explore
        action = policy._sess.run(
            policy._sampled_action, feed_dict=feed_dict)
    return action
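# --- Illustrative usage sketch ---
# How compute_action above might be called for a single observation. The "obs"
# key and the batching convention are assumptions about what the policy's model
# expects; `policy` and `env` are hypothetical, pre-built objects.
import numpy as np


def greedy_action_for(policy, env):
    """Fetch one deterministic action for a freshly sampled observation."""
    obs = np.asarray(env.observation_space.sample())
    input_dict = {"obs": np.expand_dims(obs, axis=0)}  # batch of size 1
    return compute_action(policy, input_dict, explore=False)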
def _get_weights_from_policy(policy: Policy, policy_id: PolicyID) -> dict:
    """Gets the weights from a policy and renames their keys,
    adding the policy_id as a prefix."""
    to_log = {}
    weights = policy.get_weights()

    for k, v in weights.items():
        if isinstance(v, Iterable):
            to_log[f"{policy_id}/{k}"] = v

    return to_log
def build_q_models(policy: Policy, obs_space: gym.spaces.Space,
                   action_space: gym.spaces.Space,
                   config: TrainerConfigDict) -> ModelV2:
    """Build q_model and target_q_model for Simple Q learning.

    Note that this function works for both Tensorflow and PyTorch.

    Args:
        policy (Policy): The Policy, which will use the model for optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (TrainerConfigDict):

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target q model will not be returned, just assigned to
            `policy.target_q_model`.
    """
    if not isinstance(action_space, gym.spaces.MultiDiscrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.nvec.max(),
        model_config=config["model"],
        framework=config["framework"],
        name=Q_SCOPE)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.nvec.max(),
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
def compute_q_values(policy: Policy, model: ModelV2, obs: TensorType,
                     explore, is_training=None) -> TensorType:
    _is_training = (is_training if is_training is not None
                    else policy._get_is_training_placeholder())
    model_out, _ = model(SampleBatch(obs=obs, _is_training=_is_training),
                         [], None)
    return model_out
def __init__(self, eval_net, target_net, observation_space, action_space,
             config, dqn_config):
    Policy.__init__(self, observation_space, action_space, config)
    self.device = torch.device(f"cuda:{dqn_config['cuda_id']}"
                               if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.prioritized_memory = dqn_config.get('prioritized_memry', False)
    # self.epsilon_delta = (dqn_config['update_period'] / dqn_config['replay_capacity'])
    self.epsilon_delta = 1e-3
    self.num_states = int(np.product(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    # self.eval_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    # self.target_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    self.eval_net = eval_net.to(self.device)
    self.target_net = target_net.to(self.device)
    self.target_net.load_state_dict(self.eval_net.state_dict())
    self.learn_step_counter = 0
    if self.prioritized_memory:
        self.memory = PrioritizedMemory(dqn_config['replay_capacity'],
                                        num_result=5)
    else:
        self.memory = Memory(dqn_config['replay_capacity'], num_result=5)
    self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                      lr=dqn_config['lr'])
    # self.loss_func = nn.SmoothL1Loss()
    self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0
def compute_q_values(policy: Policy, model: ModelV2, obs: TensorType,
                     explore, is_training=None) -> TensorType:
    model_out, _ = model({
        SampleBatch.CUR_OBS: obs,
        "is_training": is_training if is_training is not None
                       else policy._get_is_training_placeholder(),
    }, [], None)
    return model_out
def load_pure_strat(policy: Policy, pure_strat_spec: StrategySpec = None,
                    checkpoint_path: str = None, weights_key: str = "weights"):
    if pure_strat_spec is not None and checkpoint_path is not None:
        raise ValueError(
            "Can only pass pure_strat_spec or checkpoint_path but not both")
    if checkpoint_path is None:
        if hasattr(policy, "policy_spec") and pure_strat_spec == policy.policy_spec:
            return
        pure_strat_checkpoint_path = pure_strat_spec.metadata["checkpoint_path"]
    else:
        pure_strat_checkpoint_path = checkpoint_path

    weights = None
    try:
        # Retry a few times in case the checkpoint file is still being written.
        num_load_attempts = 5
        for attempt in range(num_load_attempts):
            try:
                checkpoint_data = deepdish.io.load(path=pure_strat_checkpoint_path)
                weights = checkpoint_data[weights_key]
                break
            except (HDF5ExtError, KeyError):
                if attempt + 1 == num_load_attempts:
                    raise
                time.sleep(1.0)
    # TODO: use correct exception
    except Exception:
        # Fall back to a cloudpickle checkpoint format.
        with open(pure_strat_checkpoint_path, "rb") as pickle_file:
            checkpoint_data = cloudpickle.load(pickle_file)
            weights = checkpoint_data[weights_key]

    weights = {k.replace("_dot_", "."): v for k, v in weights.items()}
    policy.set_weights(weights=weights)
    policy.policy_spec = pure_strat_spec
def __init__(self, agent_id, eval_net, target_net, observation_space,
             action_space, config, dqn_config):
    Policy.__init__(self, observation_space, action_space, config)
    self.device = torch.device(f"cuda:{dqn_config['cuda_id']}"
                               if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.agent_id = agent_id
    # self.epsilon_delta = (dqn_config['update_period'] / dqn_config['replay_capacity'])
    self.epsilon_delta = 1e-3
    self.num_states = int(np.product(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    # self.eval_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    # self.target_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
    self.eval_net = eval_net.to(self.device)
    self.target_net = target_net.to(self.device)

    # Collect the parameters of all per-layer models for a single optimizer.
    parameters = set()
    for layer in self.eval_net.dp_models.keys():
        parameters |= set(self.eval_net.dp_models[layer].parameters())
    self.optimizer = torch.optim.Adam(parameters, lr=dqn_config['lr'])

    self.learn_step_counter = 0
    self.memory = LayerMemory(dqn_config['replay_capacity'], num_result=5)
    # self.loss_func = nn.SmoothL1Loss()
    self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0

    # One-hot encodings of this agent's actions, offset by agent_id within
    # the full action vector of the transition model.
    self.x_action = []
    for i in range(self.num_actions):
        _action = np.zeros(self.eval_net.transition_model.num_actions)
        _action[self.agent_id * self.num_actions + i] = 1.0
        self.x_action.append(_action)
def __init__(self, agent_id, observation_space, action_space, dqn_config, models):
    Policy.__init__(self, observation_space, action_space, dqn_config)
    self.max_num_nodes = dqn_config['max_num_nodes']
    self.dqn_config = dqn_config
    self.model_abstract_on = dqn_config['model_abstract_on']
    self.num_states = int(np.product(observation_space.shape))
    self.num_actions = action_space.n
    print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
    self.epsilon = 1
    self.agent_id = agent_id
    self.learn_step_counter = 0
    self.eval_net = models['eval_net']
    self.target_net = models['target_net']
    self.all_layers = self.eval_net.get_all_layers()

    self.policies = {}
    for layer in self.all_layers:
        policy = DQNDPTorchPolicy(agent_id, observation_space, action_space,
                                  dqn_config, layer, models)
        self.policies[layer] = policy
def pg_loss_stats(policy: Policy,
                  train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Returns the calculated loss in a stats dict.

    Args:
        policy (Policy): The Policy object.
        train_batch (SampleBatch): The data used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    return {
        "policy_loss": torch.mean(
            torch.stack(policy.get_tower_stats("policy_loss"))),
    }
def stats_fn(policy: Policy, batch: SampleBatch) -> Dict[str, TensorType]:
    return {"loss": torch.mean(torch.stack(policy.get_tower_stats("loss")))}
def __init__(self, agent_id, observation_space, action_space, dqn_config,
             layer, models):
    Policy.__init__(self, observation_space, action_space, dqn_config)
    self.total_device_num = torch.cuda.device_count()
    self.device = torch.device(f"cuda:{layer % self.total_device_num}"
                               if torch.cuda.is_available() else "cpu")
    self.dqn_config = dqn_config
    self.epsilon = 1
    self.agent_id = agent_id
    self.layer = layer
    self.num_states = int(np.product(observation_space.shape))
    self.num_actions = action_space.n
    # self.epsilon_delta = (dqn_config['update_period'] / dqn_config['replay_capacity'])
    self.model_abstract_on = dqn_config['model_abstract_on']
    self.internal_update_freq = dqn_config['internal_update_freq']
    self.batch_size = dqn_config['batch_size']
    self.min_batch_size = dqn_config['min_batch_size']
    self.epsilon_delta = 1e-3
    self.encoder_feature_dim = self.num_states
    if self.model_abstract_on:
        self.encoder_feature_dim += dqn_config['encoder_feature_dim']
        self.discount = dqn_config['dist_distance_discount']
        self.bisim_coef = dqn_config['bisim_coef']
    # self.eval_net = DQNDPModule(self.encoder_feature_dim, self.num_actions, dqn_config)
    # self.target_net = DQNDPModule(self.encoder_feature_dim, self.num_actions, dqn_config)
    self.eval_net = models['eval_net']
    self.target_net = models['target_net']
    self.optimizer = torch.optim.Adam(self.eval_net.get_parameters(layer),
                                      lr=dqn_config['lr'])
    self.learn_step_counter = 0
    self.memory = LayerMemory(dqn_config['replay_capacity'], layer,
                              self.batch_size, torch.device('cpu'),
                              num_result=5)
    self.loss_func = nn.SmoothL1Loss().to(self.device)
    # self.loss_func = nn.MSELoss().to(self.device)
    self.rand_action = 0
    self.greedy_action = 0

    if self.model_abstract_on:
        decoder_parameters = set()
        self.target_encoder_model = models['target_encoder']
        self.eval_encoder_model = models['eval_encoder']
        self.eval_reward_model = models['eval_reward']
        self.eval_transition_model = models['eval_transition']
        self.encoder_optimizer = torch.optim.Adam(
            self.eval_encoder_model.get_parameters(layer),
            lr=dqn_config['lr'])
        decoder_parameters = (self.eval_transition_model.get_parameters(layer) |
                              self.eval_reward_model.get_parameters(layer))
        self.decoder_optimizer = torch.optim.Adam(decoder_parameters,
                                                  lr=dqn_config['lr'])
def set_policy_weights(policy: Policy, checkpoint_path: str):
    checkpoint_data = deepdish.io.load(path=checkpoint_path)
    weights = checkpoint_data["weights"]
    weights = {k.replace("_dot_", "."): v for k, v in weights.items()}
    policy.set_weights(weights)