def setUp(self) -> None:
    self.net = Mock()
    self.agent = DummyAgent(net=self.net)
    self.env = [gym.make("CartPole-v0") for _ in range(2)]
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.n_steps = 3
    self.gamma = 0.9
    self.source = DiscountedExperienceSource(self.env, self.agent, n_steps=self.n_steps, gamma=self.gamma)

    self.state = torch.ones(3)
    self.next_state = torch.zeros(3)
    self.reward = 1

    self.exp1 = Experience(state=self.state, action=1, reward=self.reward, done=False, new_state=self.next_state)
    self.exp2 = Experience(state=self.next_state, action=1, reward=self.reward, done=False, new_state=self.state)

    self.env1 = Mock()
    self.env1.step = Mock(return_value=(self.next_state, self.reward, True, self.state))
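# DiscountedExperienceSource in the fixture above is expected to collapse
# n_steps transitions into a single discounted reward. A minimal sketch of that
# computation for the fixture's settings (n_steps=3, gamma=0.9); the helper name
# `discounted_return` is illustrative, not part of the tested API:
def discounted_return(rewards, gamma):
    """Sum rewards with exponentially decaying weights: r0 + gamma*r1 + gamma**2*r2 + ..."""
    total = 0.0
    for step, reward in enumerate(rewards):
        total += (gamma ** step) * reward
    return total

# With the fixture's reward of 1 per step over 3 steps:
# discounted_return([1, 1, 1], gamma=0.9) == 1 + 0.9 + 0.81 == 2.71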
def setUp(self) -> None:
    self.net = Mock()
    self.agent = DummyAgent(net=self.net)
    self.env = gym.make("CartPole-v0")
    self.n_step = 2
    self.source = NStepExperienceSource(self.env, self.agent, n_steps=self.n_step)
    self.device = torch.device("cpu")

    self.state = np.zeros([32, 32])
    self.state_02 = np.ones([32, 32])
    self.next_state = np.zeros([32, 32])
    self.next_state_02 = np.ones([32, 32])
    self.action = np.zeros([1])
    self.action_02 = np.ones([1])
    self.reward = np.zeros([1])
    self.reward_02 = np.ones([1])
    self.done = np.zeros([1])
    self.done_02 = np.zeros([1])

    self.experience01 = Experience(self.state, self.action, self.reward, self.done, self.next_state)
    self.experience02 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)
    self.experience03 = Experience(self.state_02, self.action_02, self.reward_02, self.done_02, self.next_state_02)
def setUp(self) -> None:
    self.net = Mock()
    self.agent = DummyAgent(net=self.net)
    self.env = [gym.make("CartPole-v0") for _ in range(2)]
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.source = ExperienceSource(self.env, self.agent, n_steps=1)

    self.s1 = torch.ones(3)
    self.s2 = torch.zeros(3)

    self.mock_env = Mock()
    self.mock_env.step = Mock(return_value=(self.s1, 1, False, Mock()))

    self.exp1 = Experience(state=self.s1, action=1, reward=1, done=False, new_state=self.s2)
    self.exp2 = Experience(state=self.s1, action=1, reward=1, done=False, new_state=self.s2)
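# The three fixtures above assume an `Experience` record plus a `DummyAgent`
# test double. A minimal sketch of what those could look like, inferred from how
# the tests use them; the real definitions may differ:
from collections import namedtuple

Experience = namedtuple("Experience", field_names=["state", "action", "reward", "done", "new_state"])

class DummyAgent:
    """Test agent that stores the (mocked) network and returns a fixed action per state."""

    def __init__(self, net):
        self.net = net

    def __call__(self, states, device):
        # Always act with action 0; good enough to drive the experience sources in tests
        return [0] * len(states)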
def train_batch(
    self,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Contains the logic for generating a new batch of data to be passed to the DataLoader.

    This is the same function as the standard DQN except that we don't update epsilon,
    as it is always 0: the exploration comes from the noisy network.

    Returns:
        yields an Experience tuple containing the state, action, reward, done and next_state.
    """
    episode_reward = 0
    episode_steps = 0

    while True:
        self.total_steps += 1
        action = self.agent(self.state, self.device)

        # Take a step in the environment and store the transition in the buffer
        next_state, r, is_done, _ = self.env.step(action[0])

        episode_reward += r
        episode_steps += 1

        exp = Experience(
            state=self.state,
            action=action[0],
            reward=r,
            done=is_done,
            new_state=next_state,
        )

        self.buffer.append(exp)
        self.state = next_state

        if is_done:
            self.done_episodes += 1
            self.total_rewards.append(episode_reward)
            self.total_episode_steps.append(episode_steps)
            self.avg_rewards = float(np.mean(self.total_rewards[-self.avg_reward_len:]))
            self.state = self.env.reset()
            episode_steps = 0
            episode_reward = 0

        # Sample a training batch from the buffer and yield it one transition at a time
        states, actions, rewards, dones, new_states = self.buffer.sample(self.batch_size)

        for idx, _ in enumerate(dones):
            yield states[idx], actions[idx], rewards[idx], dones[idx], new_states[idx]

        # Simulates epochs
        if self.total_steps % self.batches_per_epoch == 0:
            break
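# For context: `train_batch` is a generator, so it is typically wrapped in an
# iterable dataset and handed to a DataLoader. A minimal sketch, assuming a thin
# wrapper class (named `ExperienceSourceDataset` here for illustration):
from typing import Callable, Iterator

from torch.utils.data import DataLoader, IterableDataset

class ExperienceSourceDataset(IterableDataset):
    """Iterable dataset that defers iteration to a batch-generating callable."""

    def __init__(self, generate_batch: Callable[[], Iterator]):
        self.generate_batch = generate_batch

    def __iter__(self) -> Iterator:
        return self.generate_batch()

# Hypothetical wiring inside the model:
# def train_dataloader(self) -> DataLoader:
#     return DataLoader(ExperienceSourceDataset(self.train_batch), batch_size=self.batch_size)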
def train_batch(
    self,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Contains the logic for generating a new batch of data to be passed to the DataLoader.

    Returns:
        yields an Experience tuple containing the state, action, reward, done and next_state.
    """
    episode_reward = 0
    episode_steps = 0

    while True:
        self.total_steps += 1
        action = self.agent(self.state, self.env, self.device)

        next_state, reward, is_done, info = self.env.step(action)

        episode_reward += reward
        episode_steps += 1

        exp = Experience(
            state=self.state,
            action=action,
            reward=reward,
            done=is_done,
            new_state=next_state,
        )

        # Anneal exploration as training progresses
        self.agent.update_epsilon(self.global_step)
        self.buffer.append(exp)
        self.state = next_state

        if is_done:
            self.done_episodes += 1
            self.total_rewards.append(episode_reward)
            self.total_step_count.append(episode_steps)
            self.total_at_goal.append(info["at_goal"])
            self.state = self.env.reset()
            episode_steps = 0
            episode_reward = 0

        states, actions, rewards, dones, new_states = self.buffer.sample(self.batch_size)

        for i, _ in enumerate(dones):
            yield states[i], actions[i], rewards[i], dones[i], new_states[i]

        # Simulates epochs
        if self.total_steps % self.batches_per_epoch == 0:
            break
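# The call to `self.agent.update_epsilon(self.global_step)` above implies a
# decaying exploration schedule. A minimal sketch of a linear anneal, assuming
# `eps_start`, `eps_end` and `eps_last_frame` attributes on the agent (the
# attribute names are assumptions, not confirmed by this code):
def update_epsilon(self, step: int) -> None:
    """Linearly anneal epsilon from eps_start down to eps_end over eps_last_frame steps."""
    self.epsilon = max(self.eps_end, self.eps_start - step / self.eps_last_frame)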
def populate(self, warm_start: int) -> None:
    """Populates the buffer with initial experience."""
    if warm_start > 0:
        self.state = self.env.reset()

        for _ in range(warm_start):
            action = self.agent(self.state, self.device)
            next_state, reward, done, _ = self.env.step(action[0])
            exp = Experience(
                state=self.state,
                action=action[0],
                reward=reward,
                done=done,
                new_state=next_state,
            )
            self.buffer.append(exp)
            self.state = next_state

            if done:
                self.state = self.env.reset()
def populate(self, warm_start: int) -> None:
    """Populates the buffer with initial experiences."""
    if warm_start > 0:
        self.state = self.env.reset()

        for _ in tqdm(range(warm_start), desc="Populate replay buffer"):
            # Act fully at random while warm-starting, unless resuming from a checkpoint
            if not self.resumed:
                self.agent.epsilon = 1.0
            action = self.agent(self.state, self.env, self.device)
            next_state, reward, done, _ = self.env.step(action)
            exp = Experience(
                state=self.state,
                action=action,
                reward=reward,
                done=done,
                new_state=next_state,
            )
            self.buffer.append(exp)
            self.state = next_state

            if done:
                self.state = self.env.reset()
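# Both `populate` variants only need `append` from the buffer here, and the
# `train_batch` methods above need `sample`. A minimal sketch of a uniform
# replay buffer with that interface (the real buffer may batch its fields
# differently):
from collections import deque

import numpy as np

class ReplayBuffer:
    """Fixed-size FIFO buffer of Experience tuples with uniform random sampling."""

    def __init__(self, capacity: int):
        self.buffer = deque(maxlen=capacity)

    def __len__(self) -> int:
        return len(self.buffer)

    def append(self, experience) -> None:
        self.buffer.append(experience)

    def sample(self, batch_size: int):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, new_states = zip(*(self.buffer[i] for i in indices))
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=bool),
            np.array(new_states),
        )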