def update(self, state, action, reward, next_state, done) -> torch.Tensor:
    loss: torch.Tensor = super().update(state, action, reward, next_state, done)

    # periodically synchronise the target network with the online network
    if self.total_steps_done % 100 == 0:
        update_agent_model(self.model, self.target_model)

    return loss
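# update_agent_model() is used throughout but is not shown in these listings. A minimal
# sketch of what it could look like, assuming it performs the usual hard target-network
# update by copying the online network's weights into the target network:
def update_agent_model(current: torch.nn.Module, target: torch.nn.Module) -> None:
    # overwrite the target parameters with the current (online) parameters
    target.load_state_dict(current.state_dict())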
def train(iterations: int, batch_size: int):
    state = env.reset()
    losses = []
    all_rewards = []
    episode_reward = 0

    for iteration in range(1, iterations + 1):
        # select an action epsilon-greedily with the current exploration rate
        epsilon = epsilon_calculator.value(time_step=iteration)
        action = current_model.act(state=state, epsilon=epsilon)

        # take the action and store the transition in the replay memory
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        # once more transitions are stored than the batch size, compute the loss
        if len(memory) > batch_size:
            loss = compute_td_loss(batch_size=batch_size, beta=args.beta)
            losses.append(loss.item())

        # periodically synchronise the target network with the online network
        if iteration % 100 == 0:
            update_agent_model(current=current_model, target=target_model)

        if iteration % 200 == 0:
            print('Iteration: {0}'.format(iteration))
            print('Rewards: {0}'.format(all_rewards[-9:]))

    # final synchronisation once training is finished
    update_agent_model(current=current_model, target=target_model)
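# compute_td_loss() is referenced above but defined elsewhere in the project. Below is a
# rough sketch of a double-DQN TD loss with prioritized replay; the exact sample() and
# update_priorities() signatures of PrioritizedReplayMemory are assumptions here, as is
# the ordering of the values they return.
def compute_td_loss(batch_size: int, beta: float) -> torch.Tensor:
    # assumed to also return importance-sampling weights and the sampled indices
    state, action, reward, next_state, done, weights, indices = memory.sample(batch_size, beta)

    state = torch.FloatTensor(state)
    next_state = torch.FloatTensor(next_state)
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    done = torch.FloatTensor(done)
    weights = torch.FloatTensor(weights)

    # Q(s, a) for the actions that were actually taken
    q_value = current_model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # double DQN: the online network selects the next action,
    # the target network evaluates it
    next_action = current_model(next_state).argmax(dim=1)
    next_q_value = target_model(next_state).gather(1, next_action.unsqueeze(1)).squeeze(1)
    expected_q_value = reward + args.gamma * next_q_value * (1 - done)

    # importance-sampling-weighted TD error; new priorities come from its magnitude
    td_error = q_value - expected_q_value.detach()
    loss = (td_error.pow(2) * weights).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    memory.update_priorities(indices, td_error.abs().detach().numpy() + 1e-6)
    return loss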
def __init__(self, num_features: int, num_actions: int, eps_calculator: Schedule,
             memory_eps_calculator: Schedule, memory_size: int = 10000, batch_size: int = 32,
             learning_rate: float = 2e-3, gamma: float = 0.99, memory_delay: int = 5000,
             representation_network: torch.nn.Module = None) -> None:
    # configure parent parameters
    super().__init__(num_features, num_actions, eps_calculator, memory_eps_calculator,
                     memory_size, batch_size, learning_rate, gamma, memory_delay,
                     representation_network)

    # the target model needs the representation network as well, because otherwise
    # copying the parameters over would be non-trivial
    self.target_model = DQN(num_features=num_features, num_actions=num_actions,
                            representation_network=representation_network)
    update_agent_model(current=self.model, target=self.target_model)
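# The DQN network itself is defined elsewhere. A minimal sketch is given below, assuming a
# small fully connected Q-value head that can optionally be preceded by a shared
# representation network; the layer sizes and act() behaviour are illustrative only and
# not necessarily the exact implementation used here.
import random


class DQN(torch.nn.Module):
    def __init__(self, num_features: int, num_actions: int,
                 representation_network: torch.nn.Module = None) -> None:
        super().__init__()
        # when a representation network is supplied, num_features is assumed
        # to match the size of its output
        self.representation_network = representation_network or torch.nn.Identity()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(num_features, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, num_actions)
        )
        self.num_actions = num_actions

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # run the (optional) shared representation first, then the Q-value head
        return self.layers(self.representation_network(x))

    def act(self, state, epsilon: float) -> int:
        # epsilon-greedy action selection
        if random.random() > epsilon:
            with torch.no_grad():
                q_values = self.forward(torch.FloatTensor(state).unsqueeze(0))
            return int(q_values.argmax(dim=1).item())
        return random.randrange(self.num_actions)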
def finish_training(self) -> None:
    super().finish_training()
    update_agent_model(self.model, self.target_model)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='DDQN agent execution')
    parser.add_argument('--env', type=str, metavar='E', default='CartPole-v0', help='GYM environment')
    parser.add_argument('--init_eps', type=float, metavar='I', default=1.0, help='Initial epsilon')
    parser.add_argument('--min_eps', type=float, metavar='M', default=0.01, help='Minimum epsilon')
    parser.add_argument('--eps_decay', type=int, metavar='D', default=5000, help='Epsilon decay')
    parser.add_argument('--gamma', type=float, metavar='G', default=0.99, help='Gamma')
    parser.add_argument('--memory_size', type=int, metavar='S', default=10000, help='Memory size')
    parser.add_argument('--alpha', type=float, metavar='A', default=0.8,
                        help='How much prioritization is used (0 - no prioritization, 1 - full prioritization)')
    parser.add_argument('--beta', type=float, metavar='B', default=0.8,
                        help='Degree to use importance weights (0 - no corrections, 1 - full correction)')
    parser.add_argument('--batch_size', type=int, metavar='BS', default=32, help='Batch size')
    parser.add_argument('--iterations', type=int, metavar='IT', default=30000, help='Training iterations')
    args = parser.parse_args()

    # Environment information extraction
    env = gym.make(args.env)
    number_of_observations = env.observation_space.shape[0]
    number_of_actions = env.action_space.n

    # Agent creation and configuration
    current_model = DQN(num_features=number_of_observations, num_actions=number_of_actions)
    target_model = DQN(num_features=number_of_observations, num_actions=number_of_actions)
    update_agent_model(current=current_model, target=target_model)

    optimizer = optim.Adam(current_model.parameters())
    memory = PrioritizedReplayMemory(capacity=args.memory_size, alpha=args.alpha)
    epsilon_calculator = ExponentialSchedule(initial_p=args.init_eps, min_p=args.min_eps, decay=args.eps_decay)

    # Training, evaluation and persistence
    train(iterations=args.iterations, batch_size=args.batch_size)
    play(iterations=10000, render=True)
    save_model(target_model, "../../models/ddqn.model")
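# play() and save_model() are helpers from the surrounding project and are not shown in
# these listings. A rough sketch of what they might do, assuming play() simply runs the
# greedy learned policy for a number of steps and save_model() stores the network weights:
def play(iterations: int, render: bool = False) -> None:
    state = env.reset()
    for _ in range(iterations):
        if render:
            env.render()
        # epsilon = 0 -> always act greedily with the learned policy
        action = current_model.act(state=state, epsilon=0.0)
        state, reward, done, _ = env.step(action)
        if done:
            state = env.reset()


def save_model(model: torch.nn.Module, path: str) -> None:
    # persist only the parameters; they can be restored later with load_state_dict()
    torch.save(model.state_dict(), path)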