Example No. 1
    def __init__(
        self,
        discount_rate: float,
        action_space_dim: int,
        observation_space_dim: int,
        policy_net_layers_spec: Tuple,
        value_net_layers_spec: Tuple,
        loss_fn,
        optimiser,
        random_seed: int = None,
        cuda: bool = False,
        learning_rate: float = None,
        policy_learning_rate: float = None,
        value_learning_rate: float = None,
    ):
        """ `learning_rate` is a default for the policy and value learning rates, if those aren't specified."""
        if random_seed is not None:
            self.seed(random_seed)

        self.name = 'AC'

        # fall back to the default learning_rate (easier compatibility with DQN)
        policy_learning_rate = policy_learning_rate or learning_rate
        value_learning_rate = value_learning_rate or learning_rate

        self.discount_rate = discount_rate
        self.policy_net = create_sequential_model(
            num_inputs=observation_space_dim,
            layers_spec=policy_net_layers_spec,
            num_outputs=action_space_dim,
            dropout_rate=0,
            activation_function='relu',
            final_activation=False)
        self.value_net = create_sequential_model(
            num_inputs=observation_space_dim,
            layers_spec=value_net_layers_spec,
            num_outputs=1,
            dropout_rate=0,
            activation_function='relu',
            final_activation=False)
        self.loss_fn = loss_fn()
        self.policy_optimiser = optimiser(params=self.policy_net.parameters(),
                                          lr=policy_learning_rate)
        self.value_optimiser = optimiser(params=self.value_net.parameters(),
                                         lr=value_learning_rate)
        if cuda:
            self.policy_net.cuda()
            self.value_net.cuda()
            self.device = 'cuda'
        else:
            self.device: str = 'cpu'
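
A minimal usage sketch of the constructor above, assuming the loss function and optimiser are passed as classes (consistent with the loss_fn() and optimiser(params=..., lr=...) calls in the body), e.g. torch.nn.MSELoss and torch.optim.Adam; the class name, dimensions and layer specs here are illustrative only:

import torch

agent = ActorCriticAgent(  # hypothetical class name for the __init__ shown above
    discount_rate=0.99,
    action_space_dim=2,
    observation_space_dim=4,
    policy_net_layers_spec=(64, 64),
    value_net_layers_spec=(64, 64),
    loss_fn=torch.nn.MSELoss,    # passed as a class, instantiated inside __init__
    optimiser=torch.optim.Adam,  # passed as a class, instantiated inside __init__
    random_seed=0,
    learning_rate=1e-3,          # shared default for the policy and value learning rates
)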
Example No. 2
def test_q_network_with_actions():
    seed_everything()
    training_steps = 500
    learning_rate = 0.05

    main_net = create_sequential_model(layers_spec=(64, ),
                                       num_inputs=1,
                                       num_outputs=64,
                                       activation_function='relu',
                                       final_activation=True,
                                       dropout_rate=0)
    q_network = QNetwork(main_net=main_net,
                         final_layer_neurons=64,
                         num_outputs=2,
                         duelling=True)

    loss_fn = loss_functions['mse']()
    optimiser = optimisers['adam'](params=q_network.parameters(),
                                   lr=learning_rate)

    examples = (
        (np.array([0]), np.array([0, -1])),
        (np.array([0.1]), np.array([0, -1])),
        (np.array([0.2]), np.array([0, 1])),
        (np.array([0.3]), np.array([0, -1])),
        (np.array([0.4]), np.array([0, -1])),
    )

    x = torch.from_numpy(
        np.vstack([x_ for x_, y_ in examples]).astype(np.float32))
    y = torch.from_numpy(
        np.vstack([y_ for x_, y_ in examples]).astype(np.float32))

    for i in range(training_steps):
        # pick some pretend 'actions' to optimise against, as we will during training
        random_actions = torch.from_numpy(np.random.randint(0, 2, 5))

        y_pred = q_network(x)
        action_preds = y_pred.gather(1, random_actions.unsqueeze(1))

        targets = y.gather(1, random_actions.unsqueeze(1))

        loss = loss_fn(action_preds, targets)

        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    assert loss < 1e-3, "Q net with actions doesn't converge!"
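
The duelling=True flag above presumably selects a duelling value/advantage head; a minimal sketch of the standard duelling decomposition Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), assuming (not confirmed by this snippet) that QNetwork follows it:

import torch
import torch.nn as nn

class DuellingHead(nn.Module):
    """Illustrative duelling head: combines a state-value stream and an
    advantage stream into Q-values (an assumption about what duelling=True does)."""

    def __init__(self, final_layer_neurons: int, num_outputs: int):
        super().__init__()
        self.value_stream = nn.Linear(final_layer_neurons, 1)
        self.advantage_stream = nn.Linear(final_layer_neurons, num_outputs)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        value = self.value_stream(features)          # V(s), shape (batch, 1)
        advantage = self.advantage_stream(features)  # A(s, a), shape (batch, num_actions)
        # subtract the mean advantage so the two streams are identifiable
        return value + advantage - advantage.mean(dim=1, keepdim=True)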
Example No. 3
def test_simple_q_network():
    seed_everything()

    training_steps = 1000
    learning_rate = 0.01

    main_net = create_sequential_model(layers_spec=(64, ),
                                       num_inputs=1,
                                       num_outputs=64,
                                       activation_function='relu',
                                       final_activation=True,
                                       dropout_rate=0)
    q_network = QNetwork(main_net=main_net,
                         final_layer_neurons=64,
                         num_outputs=2,
                         duelling=False)

    loss_fn = loss_functions['mse']()
    optimiser = optimisers['adam'](params=q_network.parameters(),
                                   lr=learning_rate)

    examples = (
        (np.array([0]), np.array([0, -1])),
        (np.array([0.1]), np.array([0, -1])),
        (np.array([0.2]), np.array([0, 1])),
        (np.array([0.3]), np.array([0, -1])),
        (np.array([0.4]), np.array([0, -1])),
    )

    x = torch.from_numpy(
        np.vstack([x_ for x_, y_ in examples]).astype(np.float32))
    y = torch.from_numpy(
        np.vstack([y_ for x_, y_ in examples]).astype(np.float32))

    for i in range(training_steps):
        y_pred = q_network(x)
        loss = loss_fn(y_pred, y)

        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    assert loss < 1e-3, "Plain Q net doesn't converge!"
Example No. 4
env = gym.make(args.gymenv)

loss_fn = loss_functions[args.loss_fn]
optimiser = optimisers[args.optimiser]

# set seeds
if args.seed is not None:
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

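# e.g. args.layers_spec == '64_64' -> layers_spec == (64, 64): hidden-layer widths for the policy net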
layers_spec = tuple(int(x) for x in args.layers_spec.split('_'))
net = create_sequential_model(num_inputs=env.observation_space.shape[0],
                              layers_spec=layers_spec,
                              num_outputs=env.action_space.n,
                              dropout_rate=0,
                              activation_function='relu',
                              final_activation=False)

agent = VPGAgent(
    learning_rate=args.lr,
    discount_rate=args.discount,
    policy_net=net,
    random_seed=args.seed,
    loss_fn=loss_fn,
    optimiser=optimiser,
    cuda=args.cuda,
)

trainer = Trainer(env=env,
                  agent=agent,
Example No. 5
    def __init__(self,
                 learning_rate: float,
                 discount_rate: float,
                 action_space_dim: int,
                 observation_space_dim: int,
                 value_net_layer_spec: Tuple,
                 final_layer_neurons: int,
                 target_update_steps: int,
                 loss_fn,
                 optimiser,
                 random_seed: int = None,
                 duelling: bool = True,
                 gradient_clipping_value=None,
                 gradient_clipping_threshold=None,
                 gradient_clipping_norm=None,
                 cuda: bool = False,
                 train_mode: bool = False,
                 start_epsilon: float = 1,
                 end_epsilon: float = 0.01,
                 epsilon_decay_steps: int = 10000,
                 eval_mode: bool = False,
                 **kwargs):

        if random_seed is not None:
            self.seed(random_seed)

        self.discount_rate = discount_rate
        main_net = create_sequential_model(num_inputs=observation_space_dim,
                                           layers_spec=value_net_layer_spec,
                                           num_outputs=final_layer_neurons,
                                           dropout_rate=0,
                                           activation_function='relu',
                                           final_activation=True)
        self.q_network = QNetwork(main_net,
                                  final_layer_neurons,
                                  action_space_dim,
                                  duelling=duelling)
        self.target_network = QNetwork(copy.deepcopy(main_net),
                                       final_layer_neurons,
                                       action_space_dim,
                                       duelling=duelling)
        self.loss_fn = loss_fn()
        self.optimiser = optimiser(params=self.q_network.parameters(),
                                   lr=learning_rate)
        self.gradient_clipping_value = gradient_clipping_value
        self.gradient_clipping_norm = gradient_clipping_norm
        self.gradient_clipping_threshold = gradient_clipping_threshold
        self.train_mode: bool = train_mode  # if true, use epsilon-greedy exploration

        self.target_update_steps: int = target_update_steps  # how often to update the target net (every n backward passes)
        self.num_backward_passes: int = 0  # counter to know when to update target net
        self.num_target_net_updates: int = 0  # counter of how many target net updates have been performed
        self.num_training_steps: int = 0  # counter to know how many training steps we've taken

        self.start_epsilon = start_epsilon
        self.end_epsilon = end_epsilon
        self.epsilon_decay_steps = epsilon_decay_steps

        self.epsilon = start_epsilon

        self.name = 'DQN'  # todo: make this more reusable/structured

        self.eval_mode = eval_mode  # if true, use fully greedy policy (no epsilon-randomness)

        self.possible_actions = np.arange(
            action_space_dim)  # used for action sampling and masking

        self.update_target_network()

        if cuda:
            self.q_network.cuda()
            self.target_network.cuda()
            self.device = 'cuda'
        else:
            self.device: str = 'cpu'
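
The epsilon attributes stored above aren't consumed anywhere in this snippet; a minimal sketch of how a linear epsilon decay and an epsilon-greedy action selection could use them (the agent's real methods aren't shown here, so the helper names and the linear schedule are assumptions):

import numpy as np
import torch

def decay_epsilon(agent) -> None:
    # hypothetical helper: decay epsilon linearly from start_epsilon to end_epsilon
    fraction = min(agent.num_training_steps / agent.epsilon_decay_steps, 1.0)
    agent.epsilon = agent.start_epsilon + fraction * (agent.end_epsilon - agent.start_epsilon)

def epsilon_greedy_action(agent, observation: np.ndarray) -> int:
    # hypothetical helper: explore with probability epsilon during training, else act greedily
    if agent.train_mode and not agent.eval_mode and np.random.rand() < agent.epsilon:
        return int(np.random.choice(agent.possible_actions))
    with torch.no_grad():
        obs = torch.from_numpy(observation.astype(np.float32)).unsqueeze(0).to(agent.device)
        q_values = agent.q_network(obs)
    return int(q_values.argmax(dim=1).item())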