def __init__(self, config: Dict[str, Any]):
    super().__init__()

    default_config = {
        "num_subgoals": 2,
        "emb_size": 4,
        "rel_hiddens": (16, 16, ),
        "mlp_hiddens": (16, ),
        "activation": "leaky_relu"
    }
    self.config = with_default_config(config, default_config)

    self.activation: Callable[[Tensor], Tensor] = get_activation(self.config["activation"])

    self.own_embedding = nn.Parameter(torch.randn(self.config["emb_size"]) / 10., requires_grad=True)
    self.agent_embedding = nn.Parameter(torch.randn(self.config["emb_size"]) / 10., requires_grad=True)
    self.subgoal_embedding = nn.Parameter(torch.randn(self.config["emb_size"]) / 10., requires_grad=True)
    self.goal_embedding = nn.Parameter(torch.randn(self.config["emb_size"]) / 10., requires_grad=True)

    rel_sizes = (2 * (self.config["emb_size"] + 3), ) + self.config["rel_hiddens"]
    mlp_sizes = (self.config["rel_hiddens"][-1], ) + self.config["mlp_hiddens"]

    self.relation_layers = nn.ModuleList([
        nn.Linear(in_size, out_size)
        for in_size, out_size in zip(rel_sizes, rel_sizes[1:])
    ])

    self.mlp_layers = nn.ModuleList([
        nn.Linear(in_size, out_size)
        for in_size, out_size in zip(mlp_sizes, mlp_sizes[1:])
    ])
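# A minimal illustrative sketch (not part of the module above): the relation input
# width 2 * (emb_size + 3) suggests each entity is represented by its learned
# embedding concatenated with 3 observed scalars, and that entities are fed to the
# relation layers in pairs. The feature names below are hypothetical.
import torch
import torch.nn as nn

emb_size = 4
own_emb, other_emb = torch.randn(emb_size), torch.randn(emb_size)
own_feats, other_feats = torch.randn(3), torch.randn(3)  # e.g. x, y, heading (assumed)

pair = torch.cat([own_emb, own_feats, other_emb, other_feats])  # shape (14,) == 2 * (emb_size + 3)
relation = nn.Sequential(nn.Linear(2 * (emb_size + 3), 16), nn.LeakyReLU(),
                         nn.Linear(16, 16), nn.LeakyReLU())
print(relation(pair).shape)  # torch.Size([16])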
def __init__(self, config: Dict): super().__init__(config) default_config = { "input_shape": (100, 100), "num_actions": 5, "activation": "relu", } self.config = with_default_config(config, default_config) self.activation = get_activation(self.config["activation"]) input_shape: Tuple[int, int] = self.config["input_shape"] self.conv_layers = nn.ModuleList([ nn.Conv2d(4, 32, kernel_size=8, stride=4), # 24x24x32 nn.Conv2d(32, 64, kernel_size=7, stride=3), # 6x6x64 nn.Conv2d(64, 64, kernel_size=3, stride=1) ]) # 4x4x64 _coords_i = torch.linspace(-1, 1, input_shape[0]).view(-1, 1).repeat( 1, input_shape[1]) _coords_j = torch.linspace(-1, 1, input_shape[1]).view(1, -1).repeat( input_shape[0], 1) self.coords = torch.stack([_coords_i, _coords_j]) # flatten self.policy_head = nn.Linear(4 * 4 * 64, self.config["num_actions"]) self.value_head = nn.Linear(4 * 4 * 64, 1)
def __init__(self, agent: Agent, env: UnityEnvironment, config: Dict[str, Any]):
    super().__init__(agent, env, config)

    default_config = {
        "steps": 2048,

        # Tensorboard settings
        "tensorboard_name": None,  # str, set explicitly

        # PPO
        "ppo_config": {
            # GD settings
            "optimizer": "adam",
            "optimizer_kwargs": {
                "lr": 1e-4,
                "betas": (0.9, 0.999),
                "eps": 1e-7,
                "weight_decay": 0,
                "amsgrad": False
            },
            "gamma": .99,  # Discount factor

            # PPO settings
            "ppo_steps": 25,  # Maximum number of gradient updates per iteration
            "eps": 0.1,  # PPO clip parameter
            "target_kl": 0.01,  # KL divergence limit
            "value_loss_coeff": 0.1,
            "entropy_coeff": 0.1,
            "max_grad_norm": 0.5,

            # Backpropagation settings
            "use_gpu": False,
        }
    }
    self.config = with_default_config(config, default_config)

    self.collector = Collector(agent=self.agent, env=self.env)
    self.ppo = PPOptimizer(agent=agent, config=self.config["ppo_config"])

    # Setup tensorboard
    self.writer: SummaryWriter
    if self.config["tensorboard_name"]:
        dt_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.path = Path.home() / "drlnd_logs" / f"{self.config['tensorboard_name']}_{dt_string}"
        self.writer = SummaryWriter(str(self.path))

        # Log the configs
        with open(str(self.path / "trainer_config.json"), "w") as f:
            json.dump(self.config, f)

        with open(str(self.path / "agent_config.json"), "w") as f:
            json.dump(self.agent.model.config, f)

        self.path = str(self.path)
    else:
        self.writer = None
def __init__(self, config: Dict): super().__init__(config) default_config = { "input_size": 15, "num_actions": 5, "hidden_sizes": (64, 64), "activation": "leaky_relu", } self.config = with_default_config(config, default_config) input_size: int = self.config.get("input_size") num_actions: int = self.config.get("num_actions") hidden_sizes: Tuple[int] = self.config.get("hidden_sizes") self.activation: Callable = get_activation( self.config.get("activation")) layer_sizes = (input_size, ) + hidden_sizes self.hidden_layers = nn.ModuleList([ nn.Linear(in_size, out_size) for in_size, out_size in zip(layer_sizes, layer_sizes[1:]) ]) self.policy_head = nn.Linear(layer_sizes[-1], num_actions) self.value_head = nn.Linear(layer_sizes[-1], 1)
def __init__(self, config: Dict): super().__init__(config) default_config = { "input_shape": (100, 100), "num_actions": 5, "activation": "relu", } self.config = with_default_config(config, default_config) input_shape: Tuple[int, int] = self.config["input_shape"] input_size: int = self.config.get("input_size") num_actions: int = self.config.get("num_actions") hidden_sizes: Tuple[int] = self.config.get("hidden_sizes") self.activation: Callable = get_activation( self.config.get("activation")) self.conv = nn.Conv2d(3, 3, kernel_size=3, padding=1) layer_sizes = (input_size, ) + hidden_sizes self.hidden_layers = nn.ModuleList([ nn.Linear(in_size, out_size) for in_size, out_size in zip(layer_sizes, layer_sizes[1:]) ]) self.policy_head = nn.Linear(layer_sizes[-1], num_actions) self.value_head = nn.Linear(layer_sizes[-1], 1)
def __init__(self, config: Dict):
    super().__init__(config)
    torch.manual_seed(0)

    default_config = {
        "input_size": 33,
        "num_actions": 4,
        "activation": "relu",
        "hidden_sizes": (64, 64),
    }
    self.config = with_default_config(config, default_config)

    input_size: int = self.config.get("input_size")
    num_actions: int = self.config.get("num_actions")
    hidden_sizes: Tuple[int] = self.config.get("hidden_sizes")
    self.activation: Callable = get_activation(self.config.get("activation"))

    layer_sizes = (input_size,) + hidden_sizes

    self.hidden_layers = nn.ModuleList([
        nn.Linear(in_size, out_size)
        for in_size, out_size in zip(layer_sizes, layer_sizes[1:])
    ])

    self.policy_mu_head = nn.Linear(layer_sizes[-1], num_actions)

    self.v_hidden_layers = nn.ModuleList([
        nn.Linear(in_size, out_size)
        for in_size, out_size in zip(layer_sizes, layer_sizes[1:])
    ])

    self.std = nn.Parameter(torch.ones(1, num_actions))
    self.value_head = nn.Linear(layer_sizes[-1], 1)
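# Hedged sketch (an assumption, since the forward pass is not shown here): with a
# policy_mu_head and a learned per-dimension std parameter, a diagonal Gaussian
# policy is typically built and sampled roughly like this.
import torch
from torch.distributions import Normal

mu = torch.zeros(1, 4)   # stand-in for policy_mu_head(features)
std = torch.ones(1, 4)   # stand-in for the self.std parameter (kept positive in practice)
dist = Normal(mu, std)
action = dist.sample()
logprob = dist.log_prob(action).sum(dim=-1)   # sum over the 4 action dimensions
entropy = dist.entropy().sum(dim=-1)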
def __init__(self, agent: Agent, config: Dict[str, Any]):
    self.agent = agent

    default_config = {
        # GD settings
        "optimizer": "adam",
        "optimizer_kwargs": {
            "lr": 1e-3,
            "betas": (0.9, 0.999),
            "eps": 1e-7,
            "weight_decay": 0,
            "amsgrad": False
        },
        "separate_optimizers": False,
        "gamma": 0.95,  # Discount factor

        # "batch_size": 64,
        "minibatches": 32,

        # PPO settings
        "ppo_steps": 5,
        "eps": 0.1,  # PPO clip parameter
        "target_kl": 0.01,  # KL divergence limit
        "value_loss_coeff": 0.1,
        "entropy_coeff": 0.01,
        "entropy_decay_time": 100,  # How many steps to decrease entropy to 0.1 of the original value
        "min_entropy": 0.01,  # Minimum value of the entropy bonus - use this to disable decay
        "max_grad_norm": 0.5,

        # GPU
        "use_gpu": False,
    }
    self.config = with_default_config(config, default_config)

    self.optimizer = get_optimizer(self.config["optimizer"])(agent.model.parameters(),
                                                             **self.config["optimizer_kwargs"])

    self.gamma: float = self.config["gamma"]
    self.eps: float = self.config["eps"]
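# Hedged sketch (an assumption based only on the config comments above, not the
# actual implementation): an exponential entropy-coefficient schedule that reaches
# 0.1x its initial value after entropy_decay_time steps and is floored at min_entropy.
def entropy_coeff_at(step: int,
                     entropy_coeff: float = 0.01,
                     entropy_decay_time: float = 100,
                     min_entropy: float = 0.01) -> float:
    return max(min_entropy, entropy_coeff * 0.1 ** (step / entropy_decay_time))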
def __init__(self, config: Dict): super().__init__(config) default_config = { "input_shape": (100, 100), "num_actions": 3, "activation": "relu", "field_threshold": 6, "hidden_sizes": (64, 64), } self.config = with_default_config(config, default_config) self.activation = get_activation(self.config["activation"]) self.field_threshold = self.config["field_threshold"] hidden_sizes: Tuple[int] = self.config.get("hidden_sizes") input_shape: Tuple[int, int] = self.config["input_shape"] _coords_i = torch.linspace(-1, 1, input_shape[0]).view(-1, 1).repeat( 1, input_shape[1]) _coords_j = torch.linspace(-1, 1, input_shape[1]).view(1, -1).repeat( input_shape[0], 1) self.coords = torch.stack([_coords_i, _coords_j]) self.bilinear = nn.Bilinear(2, 2, 4) self.pool1 = nn.AvgPool2d((100, self.field_threshold)) self.pool2 = nn.AvgPool2d((100, 100 - 2 * self.field_threshold)) self.pool3 = nn.AvgPool2d((100, self.field_threshold)) # concat + flatten to [B, 3*4] layer_sizes = (12, ) + hidden_sizes self.hidden_layers = nn.ModuleList([ nn.Linear(in_size, out_size) for in_size, out_size in zip(layer_sizes, layer_sizes[1:]) ]) self.policy_head = nn.Linear(layer_sizes[-1], self.config["num_actions"]) self.value_head = nn.Linear(layer_sizes[-1], 1)
def __init__(self, agents: Dict[str, Agent], config: Dict[str, Any]):
    self.agents = agents

    default_config = {
        # GD settings
        "optimizer": "adam",
        "optimizer_kwargs": {
            "lr": 1e-3,
            "betas": (0.9, 0.999),
            "eps": 1e-7,
            "weight_decay": 0,
            "amsgrad": False
        },

        # "batch_size": 64,
        "minibatches": 32,

        # PPO settings
        "ppo_steps": 5,
        "eps": 0.1,  # PPO clip parameter
        "target_kl": 0.01,  # KL divergence limit
        "value_loss_coeff": 0.1,
        "entropy_coeff": 0.01,
        "max_grad_norm": 0.5,

        # GPU
        "use_gpu": False,
    }
    self.config = with_default_config(config, default_config)

    self.optimizers = {
        agent_id: get_optimizer(self.config["optimizer"])(agent.model.parameters(),
                                                          **self.config["optimizer_kwargs"])
        for agent_id, agent in self.agents.items()
    }

    self.eps: float = self.config["eps"]
def __init__(self, agents: Dict[str, Agent], env: UnityEnvironment, config: Dict[str, Any]):
    super().__init__(agents, env, config)

    default_config = {
        "steps": 2000,

        # Tensorboard settings
        "tensorboard_name": None,  # str, set explicitly

        "gamma": .99,  # Discount factor
        "tau": .95,

        # PPO
        "ppo_config": {
            "optimizer": "adam",
            "optimizer_kwargs": {
                "lr": 1e-3,
                "betas": (0.9, 0.999),
                "eps": 1e-7,
                "weight_decay": 0,
                "amsgrad": False
            },

            # "batch_size": 64,
            "minibatches": 32,

            # PPO settings
            "ppo_steps": 5,
            "eps": 0.1,  # PPO clip parameter
            "target_kl": 0.01,  # KL divergence limit
            "value_loss_coeff": 0.1,
            "entropy_coeff": 0.01,
            "max_grad_norm": 0.5,

            # GPU
            "use_gpu": False,
        }
    }
    self.config = with_default_config(config, default_config)

    self.collector = Collector(agents=self.agents, env=self.env)
    self.ppo = PPOptimizer(agents=agents, config=self.config["ppo_config"])

    # Setup tensorboard
    self.writer: SummaryWriter
    if self.config["tensorboard_name"]:
        dt_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.path = Path.home() / "drlnd_logs" / f"{self.config['tensorboard_name']}_{dt_string}"
        self.writer = SummaryWriter(str(self.path))

        self.agent_paths = [self.path / agent_id for agent_id in self.agents]
        for agent_path in self.agent_paths:
            os.mkdir(str(agent_path))

        # Log the configs
        with open(str(self.path / "trainer_config.json"), "w") as f:
            json.dump(self.config, f)

        with open(str(self.path / "agent0_config.json"), "w") as f:
            json.dump(self.agents["Agent0"].model.config, f)

        with open(str(self.path / "agent1_config.json"), "w") as f:
            json.dump(self.agents["Agent1"].model.config, f)

        self.path = str(self.path)
    else:
        self.writer = None
def __init__(self, agents: Dict[str, Agent], env: MultiAgentEnv, config: Dict[str, Any]):
    self.agents = agents
    self.agent_ids: List[str] = list(agents.keys())
    self.env = env

    default_config = {
        # Trainer settings
        "agents_to_optimize": None,  # ids of agents that should be optimized
        "batch_size": 10000,  # Number of steps to sample at each iteration, TODO: make it possible to use epochs

        # Agent settings
        "optimizer": "adam",
        "optimizer_kwargs": {
            "lr": 1e-3,
            "betas": (0.9, 0.999),
            "eps": 1e-7,
            "weight_decay": 0,
            "amsgrad": False
        },
        "gamma": 0.95,  # Discount factor
        "preserve_channels": False,

        # PPO settings
        "ppo_steps": 25,
        "eps": 0.1,  # PPO clip parameter
        "target_kl": 0.01,  # KL divergence limit
        "value_loss_coeff": 0.1,
        "entropy_coeff": 0.1,

        # Tensorboard settings
        "tensorboard_name": "test",

        # Compatibility
        "tuple_mode": False,

        # GPU
        "use_gpu": False,
    }
    self.config = with_default_config(config, default_config)

    self.agents_to_optimize: List[str] = self.agent_ids if self.config['agents_to_optimize'] is None \
        else self.config['agents_to_optimize']

    self.optimizers: Dict[str, Optimizer] = {
        agent_id: get_optimizer(self.config["optimizer"])(agent.model.parameters(),
                                                          **self.config["optimizer_kwargs"])
        for agent_id, agent in self.agents.items()
        if agent_id in self.agents_to_optimize
    }

    self.gamma: float = self.config["gamma"]  # TODO use @property instead?
    self.eps: float = self.config["eps"]

    self.writer: SummaryWriter
    if self.config["tensorboard_name"]:
        dt_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.path = Path.home() / "tb_logs" / f"{self.config['tensorboard_name']}_{dt_string}"
        self.writer = SummaryWriter(str(self.path))

        # Log the configs
        with open(str(self.path / "trainer_config.pkl"), "wb") as f:
            pickle.dump(self.config, f)

        for agent_id in self.agent_ids:
            with open(str(self.path / f"{agent_id}_config.pkl"), "wb") as f:
                pickle.dump(self.agents[agent_id].model.config, f)

        with open(str(self.path / "env_config.pkl"), "wb") as f:
            try:
                env_config = self.env.config
                pickle.dump(env_config, f)
            except AttributeError:
                pass
    else:
        self.writer = None

    self.collector = Collector(agents=self.agents, env=self.env, tuple_mode=self.config["tuple_mode"])
def train_on_data(self, data_batch: DataBatch,
                  step: int = 0,
                  extra_metrics: Optional[Dict[str, Any]] = None,
                  timer: Optional[Timer] = None):
    """
    Performs a single update step with PPO on the given batch of data.

    Args:
        data_batch: DataBatch, dictionary of collected experience, keyed by field name and agent id
        step: current training iteration, used for logging
        extra_metrics: additional metrics to log alongside the ones computed here
        timer: optional Timer used to measure the duration of the update

    Returns:
        None; metrics are written via the summary writer.
    """
    metrics = {}
    if timer is None:
        timer = Timer()

    for agent_id in self.agents_to_optimize:
        agent = self.agents[agent_id]
        optimizer = self.optimizers[agent_id]

        ####################################### Unpack and prepare the data #######################################
        obs_batch = data_batch['observations'][agent_id]
        action_batch = data_batch['actions'][agent_id]
        reward_batch = data_batch['rewards'][agent_id]
        old_logprobs_batch = data_batch['logprobs'][agent_id]
        done_batch = data_batch['dones'][agent_id]

        if self.config["use_gpu"]:
            obs_batch = obs_batch.cuda()
            action_batch = action_batch.cuda()
            old_logprobs_batch = old_logprobs_batch.cuda()
            agent.model.cuda()

        logprob_batch, value_batch, entropy_batch = agent.evaluate_actions(obs_batch, action_batch)

        discounted_batch = discount_rewards_to_go(reward_batch, done_batch, self.gamma)
        if self.config["use_gpu"]:
            discounted_batch = discounted_batch.cuda()

        advantages_batch = (discounted_batch - value_batch.view(-1)).detach()
        advantages_batch = (advantages_batch - advantages_batch.mean()) / (advantages_batch.std() + 1e-6)

        # Initialize metrics
        kl_divergence = 0.
        ppo_step = 0
        value_loss = torch.tensor(0)
        policy_loss = torch.tensor(0)
        loss = torch.tensor(0)

        timer.checkpoint()

        for ppo_step in range(self.config["ppo_steps"]):
            logprob_batch, value_batch, entropy_batch = agent.evaluate_actions(obs_batch, action_batch)

            ######################################### Compute the loss #############################################
            prob_ratio = torch.exp(logprob_batch - old_logprobs_batch)
            surr1 = prob_ratio * advantages_batch
            surr2 = torch.clamp(prob_ratio, 1. - self.eps, 1 + self.eps) * advantages_batch

            # Monte Carlo estimate of KL(old || new); review formula?
            kl_divergence = torch.mean(old_logprobs_batch - logprob_batch).item()

            policy_loss = -torch.min(surr1, surr2)
            value_loss = (value_batch.view(-1) - discounted_batch) ** 2

            loss_batch = (policy_loss.mean()
                          + self.config["value_loss_coeff"] * value_loss.mean()
                          - self.config["entropy_coeff"] * entropy_batch.mean())

            loss = loss_batch.mean()

            ########################################### Update step ###############################################
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            ### Early stopping ###
            if kl_divergence > self.config["target_kl"]:
                break

        if self.config["use_gpu"]:
            agent.model.cpu()

        metrics[f"{agent_id}/time_update"] = timer.checkpoint()
        metrics[f"{agent_id}/kl_divergence"] = kl_divergence
        metrics[f"{agent_id}/steps_made"] = ppo_step
        metrics[f"{agent_id}/policy_loss"] = policy_loss.mean().item()
        metrics[f"{agent_id}/value_loss"] = value_loss.mean().item()
        metrics[f"{agent_id}/total_loss"] = loss.detach().item()

        ############################################# Collect metrics #############################################

        # Delay by one, so that the new episode starts after a done=True, with a 0 at the beginning
        episode_indices = done_batch.cumsum(dim=0)[:-1]
        episode_indices = torch.cat([torch.tensor([0]), episode_indices])  # [0, 0, 0, ..., 1, 1, ..., 2, ..., ...]
        ep_ids, ep_lens_tensor = torch.unique(episode_indices, return_counts=True)
        ep_lens = tuple(ep_lens_tensor.tolist())  # tuple of episode lengths as ints, as torch.split expects

        # Group rewards by episode and sum them up
        ep_rewards = torch.tensor([torch.sum(rewards) for rewards in torch.split(reward_batch, ep_lens)])

        ### Add new training-based metrics here ###
        metrics[f"{agent_id}/episode_len_mean"] = torch.mean(ep_lens_tensor.float()).item()
        metrics[f"{agent_id}/episode_reward_mean"] = torch.mean(ep_rewards).item()
        metrics[f"{agent_id}/episode_reward_median"] = torch.median(ep_rewards).item()
        metrics[f"{agent_id}/episode_reward_min"] = torch.min(ep_rewards).item()
        metrics[f"{agent_id}/episode_reward_max"] = torch.max(ep_rewards).item()
        metrics[f"{agent_id}/episode_reward_std"] = torch.std(ep_rewards).item()
        metrics[f"{agent_id}/episodes_this_iter"] = len(ep_ids)
        metrics[f"{agent_id}/mean_entropy"] = torch.mean(entropy_batch).item()

        metrics[f"{agent_id}/winrate"] = (reward_batch[torch.nonzero(done_batch).view(-1)].mean().item() + 1) / 2

    if extra_metrics is not None:
        metrics = with_default_config(metrics, extra_metrics)  # add extra_metrics if not computed here

    self.write_dict(metrics, step)
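# Hedged sketch (an assumption): discount_rewards_to_go is referenced above but not
# defined in these snippets. A typical implementation computes per-episode discounted
# returns, resetting the running sum at every done=True boundary, roughly like this.
import torch

def discount_rewards_to_go(rewards: torch.Tensor, dones: torch.Tensor, gamma: float) -> torch.Tensor:
    returns = torch.zeros_like(rewards)
    running = 0.0
    for i in reversed(range(len(rewards))):
        if dones[i]:
            running = 0.0  # episode boundary: do not bootstrap across it
        running = rewards[i] + gamma * running
        returns[i] = running
    return returns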