def __init__(self):
    # #### Configurations

    # number of workers
    self.n_workers = 8
    # steps sampled on each update
    self.worker_steps = 4
    # number of training epochs on each update
    self.train_epochs = 8

    # number of updates
    self.updates = 1_000_000

    # size of mini batch for training
    self.mini_batch_size = 32

    # exploration coefficient as a function of updates
    self.exploration_coefficient = Piecewise(
        [
            (0, 1.0),
            (25_000, 0.1),
            (self.updates / 2, 0.01)
        ], outside_value=0.01)

    # update the target network every 250 updates
    self.update_target_model = 250

    # $\beta$ for the replay buffer as a function of updates
    self.prioritized_replay_beta = Piecewise(
        [
            (0, 0.4),
            (self.updates, 1)
        ], outside_value=1)

    # Replay buffer with $\alpha = 0.6$. The capacity of the replay buffer must be a power of 2.
    self.replay_buffer = ReplayBuffer(2 ** 14, 0.6)

    # Model for sampling and training
    self.model = Model().to(device)
    # target model to get $\color{orange}Q(s';\color{orange}{\theta_i^{-}})$
    self.target_model = Model().to(device)

    # create workers, each with a distinct seed
    self.workers = [Worker(47 + i) for i in range(self.n_workers)]

    # initialize tensors for observations
    self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)
    for worker in self.workers:
        worker.child.send(("reset", None))
    for i, worker in enumerate(self.workers):
        self.obs[i] = worker.child.recv()

    # loss function with $\gamma = 0.99$
    self.loss_func = QFuncLoss(0.99)
    # optimizer
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2.5e-4)
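# The `Piecewise` schedule used above is assumed to interpolate linearly
# between `(step, value)` endpoints and return `outside_value` beyond the
# last endpoint. A minimal sketch under that assumption (the real helper
# may differ):
class Piecewise:
    def __init__(self, endpoints, outside_value):
        # `(step, value)` pairs, sorted by step
        self.endpoints = endpoints
        self.outside_value = outside_value

    def __call__(self, step):
        # interpolate linearly inside the matching segment
        for (s1, v1), (s2, v2) in zip(self.endpoints[:-1], self.endpoints[1:]):
            if s1 <= step < s2:
                return v1 + (v2 - v1) * (step - s1) / (s2 - s1)
        # outside the defined range
        return self.outside_value

# For example, `self.exploration_coefficient(12_500)` would give 0.55,
# halfway down the first segment from 1.0 to 0.1.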
def __init__(self):
    # #### Configurations

    # number of updates
    self.updates = 10_000
    # number of epochs to train the model with sampled data
    self.epochs = 4
    # number of worker processes
    self.n_workers = 8
    # number of steps to run on each process for a single update
    self.worker_steps = 128
    # number of mini batches
    self.n_mini_batch = 4
    # total number of samples for a single update
    self.batch_size = self.n_workers * self.worker_steps
    # size of a mini batch
    self.mini_batch_size = self.batch_size // self.n_mini_batch
    assert self.batch_size % self.n_mini_batch == 0

    # #### Initialize

    # create workers
    self.workers = [Worker(47 + i) for i in range(self.n_workers)]

    # initialize tensors for observations
    self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)
    for worker in self.workers:
        worker.child.send(("reset", None))
    for i, worker in enumerate(self.workers):
        self.obs[i] = worker.child.recv()

    # model
    self.model = Model().to(device)

    # optimizer
    self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)

    # GAE with $\gamma = 0.99$ and $\lambda = 0.95$
    self.gae = GAE(self.n_workers, self.worker_steps, 0.99, 0.95)

    # PPO Loss
    self.ppo_loss = ClippedPPOLoss()

    # Value Loss
    self.value_loss = ClippedValueFunctionLoss()
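# A minimal sketch of the `GAE` helper assumed above, computing generalized
# advantage estimates over a rollout of shape `[n_workers, worker_steps]`.
# Only the constructor signature is taken from the call
# `GAE(self.n_workers, self.worker_steps, 0.99, 0.95)`; the `__call__`
# interface and internals are assumptions.
import numpy as np

class GAE:
    def __init__(self, n_workers: int, worker_steps: int, gamma: float, lambda_: float):
        self.n_workers = n_workers
        self.worker_steps = worker_steps
        self.gamma = gamma
        self.lambda_ = lambda_

    def __call__(self, done: np.ndarray, rewards: np.ndarray, values: np.ndarray) -> np.ndarray:
        # `values` carries one extra time step, $V(s_T)$, for bootstrapping
        advantages = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
        last_advantage = np.zeros(self.n_workers, dtype=np.float32)
        # sweep backwards through the rollout
        for t in reversed(range(self.worker_steps)):
            # zero out bootstrapping across episode boundaries
            mask = 1.0 - done[:, t]
            # $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$
            delta = rewards[:, t] + self.gamma * values[:, t + 1] * mask - values[:, t]
            # $\hat{A}_t = \delta_t + \gamma \lambda \hat{A}_{t+1}$
            last_advantage = delta + self.gamma * self.lambda_ * last_advantage * mask
            advantages[:, t] = last_advantage
        return advantages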
def __init__(self, *,
             updates: int,
             epochs: IntDynamicHyperParam,
             n_workers: int,
             worker_steps: int,
             batches: int,
             value_loss_coef: FloatDynamicHyperParam,
             entropy_bonus_coef: FloatDynamicHyperParam,
             clip_range: FloatDynamicHyperParam,
             learning_rate: FloatDynamicHyperParam):
    # #### Configurations

    # number of updates
    self.updates = updates
    # number of epochs to train the model with sampled data
    self.epochs = epochs
    # number of worker processes
    self.n_workers = n_workers
    # number of steps to run on each process for a single update
    self.worker_steps = worker_steps
    # number of mini batches
    self.batches = batches
    # total number of samples for a single update
    self.batch_size = self.n_workers * self.worker_steps
    # size of a mini batch
    self.mini_batch_size = self.batch_size // self.batches
    assert self.batch_size % self.batches == 0

    # Value loss coefficient
    self.value_loss_coef = value_loss_coef
    # Entropy bonus coefficient
    self.entropy_bonus_coef = entropy_bonus_coef
    # Clipping range
    self.clip_range = clip_range
    # Learning rate
    self.learning_rate = learning_rate

    # #### Initialize

    # create workers
    self.workers = [Worker(47 + i) for i in range(self.n_workers)]

    # initialize tensors for observations
    self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)
    for worker in self.workers:
        worker.child.send(("reset", None))
    for i, worker in enumerate(self.workers):
        self.obs[i] = worker.child.recv()

    # model
    self.model = Model().to(device)

    # optimizer; `lr` here is only the initial value, and is refreshed
    # from `self.learning_rate` during training
    self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)

    # GAE with $\gamma = 0.99$ and $\lambda = 0.95$
    self.gae = GAE(self.n_workers, self.worker_steps, 0.99, 0.95)

    # PPO Loss
    self.ppo_loss = ClippedPPOLoss()

    # Value Loss
    self.value_loss = ClippedValueFunctionLoss()
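# A sketch of how this trainer might be constructed, assuming the enclosing
# class is named `Trainer`; the concrete values below are illustrative, not
# canonical. The `IntDynamicHyperParam` / `FloatDynamicHyperParam` wrappers
# let these values be adjusted while training is running.
trainer = Trainer(
    updates=10_000,
    epochs=IntDynamicHyperParam(8),
    n_workers=8,
    worker_steps=128,
    batches=4,
    value_loss_coef=FloatDynamicHyperParam(0.5),
    entropy_bonus_coef=FloatDynamicHyperParam(0.01),
    clip_range=FloatDynamicHyperParam(0.1),
    learning_rate=FloatDynamicHyperParam(2.5e-4),
)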