def __init__(
    self,
    discount_rate: float,
    action_space_dim: int,
    observation_space_dim: int,
    policy_net_layers_spec: Tuple,
    value_net_layers_spec: Tuple,
    loss_fn,
    optimiser,
    random_seed: int = None,
    cuda: bool = False,
    learning_rate: float = None,
    policy_learning_rate: float = None,
    value_learning_rate: float = None,
):
    """`learning_rate` is a default for the policy and value learning rates,
    if those aren't specified."""
    if random_seed is not None:
        self.seed(random_seed)
    self.name = 'AC'

    # fall back to the shared default (easier compatibility with DQN)
    policy_learning_rate = policy_learning_rate or learning_rate
    value_learning_rate = value_learning_rate or learning_rate

    self.discount_rate = discount_rate
    self.policy_net = create_sequential_model(
        num_inputs=observation_space_dim,
        layers_spec=policy_net_layers_spec,
        num_outputs=action_space_dim,
        dropout_rate=0,
        activation_function='relu',
        final_activation=False)
    self.value_net = create_sequential_model(
        num_inputs=observation_space_dim,
        layers_spec=value_net_layers_spec,
        num_outputs=1,  # the state-value estimate is a single scalar
        dropout_rate=0,
        activation_function='relu',
        final_activation=False)
    self.loss_fn = loss_fn()
    self.policy_optimiser = optimiser(params=self.policy_net.parameters(),
                                      lr=policy_learning_rate)
    self.value_optimiser = optimiser(params=self.value_net.parameters(),
                                     lr=value_learning_rate)
    if cuda:
        self.policy_net.cuda()
        self.value_net.cuda()
        self.device = 'cuda'
    else:
        self.device: str = 'cpu'
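# For reference, a minimal sketch of what `create_sequential_model` might look
# like, inferred purely from the call sites above. This is an assumption, not
# the repo's implementation; the `_sketch` suffix marks it as hypothetical.
import torch.nn as nn

def create_sequential_model_sketch(num_inputs, layers_spec, num_outputs,
                                   dropout_rate, activation_function,
                                   final_activation):
    activations = {'relu': nn.ReLU}
    layers = []
    in_features = num_inputs
    for width in layers_spec:
        layers.append(nn.Linear(in_features, width))
        layers.append(activations[activation_function]())
        if dropout_rate:
            layers.append(nn.Dropout(dropout_rate))
        in_features = width
    # final linear layer, optionally followed by a final activation
    layers.append(nn.Linear(in_features, num_outputs))
    if final_activation:
        layers.append(activations[activation_function]())
    return nn.Sequential(*layers)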
def test_q_network_with_actions():
    seed_everything()
    training_steps = 500
    learning_rate = 0.05
    main_net = create_sequential_model(layers_spec=(64, ),
                                       num_inputs=1,
                                       num_outputs=64,
                                       activation_function='relu',
                                       final_activation=True,
                                       dropout_rate=0)
    q_network = QNetwork(main_net=main_net,
                         final_layer_neurons=64,
                         num_outputs=2,
                         duelling=True)
    loss_fn = loss_functions['mse']()
    optimiser = optimisers['adam'](params=q_network.parameters(),
                                   lr=learning_rate)
    examples = (
        (np.array([0]), np.array([0, -1])),
        (np.array([0.1]), np.array([0, -1])),
        (np.array([0.2]), np.array([0, 1])),
        (np.array([0.3]), np.array([0, -1])),
        (np.array([0.4]), np.array([0, -1])),
    )
    x = torch.from_numpy(
        np.vstack([x_ for x_, y_ in examples]).astype(np.float32))
    y = torch.from_numpy(
        np.vstack([y_ for x_, y_ in examples]).astype(np.float32))
    for i in range(training_steps):
        # pick some pretend 'actions' to optimise, like we will in training
        random_actions = torch.from_numpy(np.random.randint(0, 2, 5))
        y_pred = q_network(x)
        action_preds = y_pred.gather(1, random_actions.unsqueeze(1))
        targets = y.gather(1, random_actions.unsqueeze(1))
        loss = loss_fn(action_preds, targets)
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
    assert loss < 1e-3, "Q net with actions doesn't converge!"
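# The `duelling=True` flag above implies that QNetwork splits its head into
# separate value and advantage streams. The standard duelling aggregation
# (Wang et al., 2016) looks roughly like the sketch below; this is an
# illustration of the technique, not necessarily this repo's QNetwork.
import torch
import torch.nn as nn

class DuellingHeadSketch(nn.Module):
    def __init__(self, main_net, final_layer_neurons, num_outputs):
        super().__init__()
        self.main_net = main_net
        self.value_stream = nn.Linear(final_layer_neurons, 1)
        self.advantage_stream = nn.Linear(final_layer_neurons, num_outputs)

    def forward(self, x):
        features = self.main_net(x)
        value = self.value_stream(features)          # V(s): shape (batch, 1)
        advantage = self.advantage_stream(features)  # A(s, a): shape (batch, num_actions)
        # subtract the mean advantage so V and A are separately identifiable
        return value + advantage - advantage.mean(dim=1, keepdim=True)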
def test_simple_q_network():
    seed_everything()
    training_steps = 1000
    learning_rate = 0.01
    main_net = create_sequential_model(layers_spec=(64, ),
                                       num_inputs=1,
                                       num_outputs=64,
                                       activation_function='relu',
                                       final_activation=True,
                                       dropout_rate=0)
    q_network = QNetwork(main_net=main_net,
                         final_layer_neurons=64,
                         num_outputs=2,
                         duelling=False)
    loss_fn = loss_functions['mse']()
    optimiser = optimisers['adam'](params=q_network.parameters(),
                                   lr=learning_rate)
    examples = (
        (np.array([0]), np.array([0, -1])),
        (np.array([0.1]), np.array([0, -1])),
        (np.array([0.2]), np.array([0, 1])),
        (np.array([0.3]), np.array([0, -1])),
        (np.array([0.4]), np.array([0, -1])),
    )
    x = torch.from_numpy(
        np.vstack([x_ for x_, y_ in examples]).astype(np.float32))
    y = torch.from_numpy(
        np.vstack([y_ for x_, y_ in examples]).astype(np.float32))
    for i in range(training_steps):
        y_pred = q_network(x)
        loss = loss_fn(y_pred, y)
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
    assert loss < 1e-3, "Plain Q net doesn't converge!"
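# Both tests lean on `seed_everything()` for determinism. A typical
# implementation looks like the sketch below (an assumption -- the repo's
# helper may also seed additional sources such as CUDA):
import random
import numpy as np
import torch

def seed_everything_sketch(seed: int = 0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)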
env = gym.make(args.gymenv)
loss_fn = loss_functions[args.loss_fn]
optimiser = optimisers[args.optimiser]

# set seeds
if args.seed is not None:
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

layers_spec = tuple(int(x) for x in args.layers_spec.split('_'))
net = create_sequential_model(num_inputs=env.observation_space.shape[0],
                              layers_spec=layers_spec,
                              num_outputs=env.action_space.n,
                              dropout_rate=0,
                              activation_function='relu',
                              final_activation=False)
agent = VPGAgent(
    learning_rate=args.lr,
    discount_rate=args.discount,
    policy_net=net,
    random_seed=args.seed,
    loss_fn=loss_fn,
    optimiser=optimiser,
    cuda=args.cuda,
)
trainer = Trainer(env=env,
                  agent=agent,
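# `VPGAgent` is a vanilla policy gradient (REINFORCE) agent. The core
# episode-level update it presumably performs looks like the sketch below;
# the helper and its signature are hypothetical, not this repo's API:
import torch

def reinforce_update_sketch(policy_net, optimiser, states, actions, rewards,
                            discount_rate):
    # compute discounted returns-to-go G_t for each step of the episode
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + discount_rate * g
        returns.insert(0, g)
    returns = torch.tensor(returns, dtype=torch.float32)

    # loss = -sum_t log pi(a_t | s_t) * G_t  (actions is a LongTensor)
    logits = policy_net(torch.stack(states))
    log_probs = torch.log_softmax(logits, dim=1)
    chosen = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)
    loss = -(chosen * returns).sum()

    optimiser.zero_grad()
    loss.backward()
    optimiser.step()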
def __init__(self,
             learning_rate: float,
             discount_rate: float,
             action_space_dim: int,
             observation_space_dim: int,
             value_net_layer_spec: Tuple,
             final_layer_neurons: int,
             target_update_steps: int,
             loss_fn,
             optimiser,
             random_seed: int = None,
             duelling: bool = True,
             gradient_clipping_value=None,
             gradient_clipping_threshold=None,
             gradient_clipping_norm=None,
             cuda: bool = False,
             train_mode: bool = False,
             start_epsilon: float = 1,
             end_epsilon: float = 0.01,
             epsilon_decay_steps: int = 10000,
             eval_mode: bool = False,
             **kwargs):
    if random_seed is not None:
        self.seed(random_seed)
    self.discount_rate = discount_rate
    main_net = create_sequential_model(num_inputs=observation_space_dim,
                                       layers_spec=value_net_layer_spec,
                                       num_outputs=final_layer_neurons,
                                       dropout_rate=0,
                                       activation_function='relu',
                                       final_activation=True)
    self.q_network = QNetwork(main_net,
                              final_layer_neurons,
                              action_space_dim,
                              duelling=duelling)
    self.target_network = QNetwork(copy.deepcopy(main_net),
                                   final_layer_neurons,
                                   action_space_dim,
                                   duelling=duelling)
    self.loss_fn = loss_fn()
    self.optimiser = optimiser(params=self.q_network.parameters(),
                               lr=learning_rate)
    self.gradient_clipping_value = gradient_clipping_value
    self.gradient_clipping_norm = gradient_clipping_norm
    self.gradient_clipping_threshold = gradient_clipping_threshold
    self.train_mode: bool = train_mode  # if true, use epsilon-greedy exploration
    self.target_update_steps: int = target_update_steps  # update the target net every n backward passes
    self.num_backward_passes: int = 0  # counter to know when to update the target net
    self.num_target_net_updates: int = 0  # counter of how many target net updates we've done
    self.num_training_steps: int = 0  # counter of how many training steps we've taken
    self.start_epsilon = start_epsilon
    self.end_epsilon = end_epsilon
    self.epsilon_decay_steps = epsilon_decay_steps
    self.epsilon = start_epsilon
    self.name = 'DQN'  # todo: make this more re-usable/structured
    self.eval_mode = eval_mode  # if true, use a fully greedy policy (no epsilon-randomness)
    self.possible_actions = np.arange(
        action_space_dim)  # used for action sampling and masking
    self.update_target_network()
    if cuda:
        self.q_network.cuda()
        self.target_network.cuda()
        self.device = 'cuda'
    else:
        self.device: str = 'cpu'
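# The attributes above suggest a linear epsilon schedule and a hard copy of
# the online net into the target net every `target_update_steps` backward
# passes. A sketch of those two pieces, under that assumption (the repo's
# actual methods, including `update_target_network`, may differ):
def decay_epsilon_sketch(self):
    # linearly anneal from start_epsilon to end_epsilon over epsilon_decay_steps
    fraction = min(self.num_training_steps / self.epsilon_decay_steps, 1.0)
    self.epsilon = self.start_epsilon + fraction * (self.end_epsilon -
                                                    self.start_epsilon)

def maybe_update_target_network_sketch(self):
    # called after num_backward_passes has been incremented for this pass;
    # hard-copies the online net's weights into the target net every n passes
    if self.num_backward_passes % self.target_update_steps == 0:
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.num_target_net_updates += 1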