Example #1
def test_transform_tf(atari_model: MuZeroTFModelV2):
    t = tf.convert_to_tensor([[0.75, 4.5, 3], [1.7, 2, 3.2]], dtype=tf.float32)
    u = atari_model.transform(t)
    v = atari_model.untransform(u)
    w = atari_model.transform(v)
    assert tf.experimental.numpy.allclose(t, v, atol=1e-3)
    assert tf.experimental.numpy.allclose(u, w, atol=1e-3)
Example #2
def test_transform_np(atari_model: MuZeroTFModelV2):
    t = np.array([[0.75, 4.5, 3], [1.7, 2, 3.2]], dtype='float32')
    u = atari_model.transform(t)
    v = atari_model.untransform(u)
    w = atari_model.transform(v)
    assert np.allclose(t, v, atol=1e-3)
    assert np.allclose(u, w, atol=1e-3)
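Both transform tests only check that transform and untransform are mutual inverses within a small tolerance. For orientation, here is a minimal NumPy sketch of the invertible value scaling from the MuZero paper, h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x with eps = 0.001; the actual MuZeroTFModelV2.transform may differ in details such as the choice of eps.

import numpy as np

EPSILON = 0.001  # value used in the MuZero paper; assumed here

def transform_sketch(x: np.ndarray) -> np.ndarray:
    # h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x
    return np.sign(x) * (np.sqrt(np.abs(x) + 1.0) - 1.0) + EPSILON * x

def untransform_sketch(x: np.ndarray) -> np.ndarray:
    # Closed-form inverse of h, as given in the MuZero paper appendix.
    return np.sign(x) * (
        ((np.sqrt(1.0 + 4.0 * EPSILON * (np.abs(x) + 1.0 + EPSILON)) - 1.0)
         / (2.0 * EPSILON)) ** 2 - 1.0)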
Example #3
def test_scalar_to_categorical(atari_model: MuZeroTFModelV2,
                               atari_config: dict):
    t = np.array([[0.75, 4.5, 3], [1.7, 2, 3.2]], dtype='float32')
    u = atari_model.scalar_to_categorical(t, atari_config['value_max'])
    v = atari_model.expectation(u, atari_model.value_basis)
    w = atari_model.scalar_to_categorical(v, atari_config['value_max'])
    assert tf.experimental.numpy.allclose(t, v)
    assert tf.experimental.numpy.allclose(u, w)
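scalar_to_categorical and expectation convert between scalars and the categorical (two-hot) representation over a fixed support that MuZero uses for value and reward targets. A rough NumPy sketch of that round trip, assuming a unit-spaced integer support (the model's actual value_basis may be defined differently):

import numpy as np

def scalar_to_categorical_sketch(x: np.ndarray, support: np.ndarray) -> np.ndarray:
    # Two-hot encoding: each scalar's weight is split between the two
    # nearest support atoms in proportion to its distance from each.
    x = np.clip(np.asarray(x, dtype=np.float32), support[0], support[-1])
    probs = np.zeros(x.shape + (support.size,), dtype=np.float32)
    flat_x = x.reshape(-1)
    flat_p = probs.reshape(-1, support.size)
    for i, value in enumerate(flat_x):
        lower = int(np.floor(value)) - int(support[0])  # unit-spaced support assumed
        frac = float(value - np.floor(value))
        flat_p[i, lower] += 1.0 - frac
        if frac > 0.0:
            flat_p[i, lower + 1] += frac
    return probs

def expectation_sketch(probs: np.ndarray, support: np.ndarray) -> np.ndarray:
    # Inverse direction: expected value of the support under the distribution.
    return (probs * support.astype(np.float32)).sum(axis=-1)

With this sketch, expectation_sketch(scalar_to_categorical_sketch(t, support), support) recovers t for any t inside the support's range, which is the behavior the test above asserts for the model's own methods.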
Example #4
def test_forward(atari_tensor_spec, atari_model: MuZeroTFModelV2):
    batch_size = 3
    t = random_obs(atari_tensor_spec[0],
                   batch_size=batch_size,
                   frames_per_obs=32)

    # Training.
    value, policy = atari_model.forward(t, is_training=True)
    assert value.shape == (batch_size, )
    assert policy.shape == (batch_size, atari_model.action_space_size)

    # Not training.
    value, policy = atari_model.forward(t, is_training=False)
    assert value.shape == (batch_size, )
    assert policy.shape == (batch_size, atari_model.action_space_size)
Example #5
def test_encode_actions(atari_model: MuZeroTFModelV2):
    t = atari_model._encode_atari_actions([0, 1])
    assert t.shape == (2, 6, 6, 4)
    assert tf.experimental.numpy.allclose(t[0, :, :, 0], tf.ones((6, 6)))
    assert tf.experimental.numpy.allclose(t[0, :, :, 1], tf.zeros((6, 6)))
    assert tf.experimental.numpy.allclose(t[0, :, :, 2], tf.zeros((6, 6)))
    assert tf.experimental.numpy.allclose(t[0, :, :, 3], tf.zeros((6, 6)))
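The assertions imply that _encode_atari_actions builds a one-hot plane per action channel, broadcast over the hidden state's 6x6 spatial grid (Breakout has 4 actions). A NumPy sketch consistent with those shapes, with the spatial shape and action count taken from the test rather than from the model itself:

import numpy as np

def encode_atari_actions_sketch(actions, spatial_shape=(6, 6), action_space_size=4):
    # Channel a is all ones for action a and all zeros otherwise,
    # repeated across the spatial grid of the hidden state.
    actions = np.asarray(actions)
    one_hot = np.eye(action_space_size, dtype=np.float32)[actions]   # (batch, A)
    ones = np.ones((len(actions),) + spatial_shape + (1,), dtype=np.float32)
    return ones * one_hot[:, None, None, :]                          # (batch, H, W, A)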
Example #6
def one_run(env: gym.Env):
    # `random_seed` and `CARTPOLE_DEFAULT_CONFIG` are defined in the
    # enclosing scope of the original test.
    env.seed(random_seed)
    env.action_space.seed(random_seed)
    model = MuZeroTFModelV2(env.observation_space, env.action_space,
                            CARTPOLE_DEFAULT_CONFIG)
    obs = env.reset()
    obs = np.expand_dims(obs, axis=0)
    return eval_model(model, obs)
Example #7
    def prepare_batch(
        self,
        policy: MuZeroTFPolicy,
        model: MuZeroTFModelV2,
        dist_class: type,  # e.g., ray.rllib.models.tf.tf_action_dist.Categorical
        train_batch: Dict[str, TensorType]
    ) -> TensorType:
        obs = train_batch[SampleBatch.CUR_OBS]
        actions = train_batch[SampleBatch.ACTIONS]

        reward_preds = []
        value_preds = []
        policy_preds = []
        hidden_state = model.representation(obs)
        for i in range(policy.loss_steps):
            value, action_probs = model.prediction(hidden_state)
            value_preds.append(value)
            policy_preds.append(action_probs)
            hidden_state, reward = model.dynamics(hidden_state, actions[:, i])
            hidden_state = scale_gradient(hidden_state, 0.5)
            reward_preds.append(reward)

        reward_preds = tf.transpose(tf.convert_to_tensor(reward_preds),
                                    perm=(1, 0, 2))
        value_preds = tf.transpose(tf.convert_to_tensor(value_preds),
                                   perm=(1, 0, 2))

        if model.action_type == MuZeroTFModelV2.ATARI:
            value_loss_fn = self.atari_value_loss
            reward_loss_fn = self.atari_reward_loss
        elif model.action_type == MuZeroTFModelV2.BOARD:
            value_loss_fn = self.board_value_loss
            reward_loss_fn = self.board_reward_loss
        else:
            raise NotImplementedError(f'action type "{model.action_type}" unknown')

        # Save the loss statistics in an object belonging to the policy.
        # The stats function will use it to return training statistics.
        return (reward_loss_fn, value_loss_fn, self.policy_loss_fn,
                train_batch['rollout_rewards'], train_batch['rollout_values'],
                train_batch['rollout_policies'], reward_preds, value_preds,
                policy_preds, train_batch[PRIO_WEIGHTS])
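prepare_batch unrolls the dynamics network loss_steps times and calls scale_gradient(hidden_state, 0.5) at every step. That helper is not shown in these examples; the published MuZero pseudocode implements the same trick as an identity in the forward pass that scales the gradient in the backward pass, roughly:

import tensorflow as tf

def scale_gradient_sketch(t: tf.Tensor, scale: float) -> tf.Tensor:
    # Forward value is t unchanged; the gradient flowing back through t
    # is multiplied by `scale` (0.5 here, as in the MuZero paper).
    return t * scale + tf.stop_gradient(t) * (1.0 - scale)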
Example #8
def eval_model(model: MuZeroTFModelV2, obs: np.ndarray):
    # `action` is defined in the enclosing scope of the original test.
    hidden_state = model.representation_net(obs)
    value1, policy1 = model.prediction_net(hidden_state)
    value1 = model.expectation(value1, model.value_basis).numpy()[0]
    policy1 = policy1.numpy()[0].tolist()
    next_state, reward = model.dynamics(hidden_state, [action])
    value2, policy2 = model.prediction_net(next_state)
    value2 = model.expectation(value2, model.value_basis).numpy()[0]
    reward = model.expectation(reward, model.reward_basis).numpy()[0]
    policy2 = policy2.numpy()[0].tolist()
    return (hidden_state.numpy().tolist(), value1, policy1, reward,
            next_state.numpy().tolist(), value2, policy2)
Example #9
def cartpole_mcts_optimized(config: Dict[str, Any]):
    env = wrap_cartpole(gym.make('CartPole-v0'))
    env.seed(config['random_seed'])
    env.action_space.seed(config['random_seed'])
    current_observation = env.reset()
    current_observation = np.expand_dims(current_observation, axis=0)

    cartpole_model = MuZeroTFModelV2(env.observation_space, env.action_space,
                                     config)

    mcts = MCTS(cartpole_model, config, random_seed=config['random_seed'])
    values, policies, actions, roots = mcts.compute_action(current_observation,
                                                           debug=True)

    assert len(roots) == 1
    return roots[0]
Example #10
def cartpole_mcts_unoptimized(network_config: Dict[str, Any]) -> Node:
    numpy.random.seed(network_config['random_seed'])
    env = wrap_cartpole(gym.make('CartPole-v0'))
    env.seed(network_config['random_seed'])
    env.action_space.seed(network_config['random_seed'])
    network = Network(env, network_config)
    config = make_cartpole_config(network_config)
    model = MuZeroTFModelV2(env.observation_space, env.action_space,
                            network_config)

    root = Node(0)
    current_observation = env.reset()
    action_history = ActionHistory([], 2)
    expand_node(root, Player(), action_history.action_space(),
                network.initial_inference(current_observation))
    add_exploration_noise(config, root)

    # We then run a Monte Carlo Tree Search using only action sequences and the
    # model learned by the network.
    run_mcts(config, root, action_history, network)
    return root
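Example #10 mirrors the structure of the reference MuZero pseudocode (Node, ActionHistory, expand_node, add_exploration_noise, run_mcts). Those helpers are not reproduced here; for orientation, the root exploration noise step in that pseudocode mixes Dirichlet noise into the children's priors roughly as follows (attribute names such as root_dirichlet_alpha are taken from the pseudocode and may differ in this repository's config):

import numpy as np

def add_exploration_noise_sketch(config, node) -> None:
    # Blend Dirichlet noise into the root's priors so MCTS also explores
    # actions the network currently assigns low probability.
    actions = list(node.children.keys())
    noise = np.random.dirichlet([config.root_dirichlet_alpha] * len(actions))
    frac = config.root_exploration_fraction
    for action, n in zip(actions, noise):
        child = node.children[action]
        child.prior = child.prior * (1 - frac) + n * frac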
Example #11
def make_mu_zero_model(policy: Policy, obs_space: gym.spaces.Space,
                       action_space: gym.spaces.Space,
                       config: TrainerConfigDict) -> MuZeroTFModelV2:
    return MuZeroTFModelV2(obs_space, action_space, config)
Example #12
    # @classmethod assumed from the `cls` parameter.
    @classmethod
    def atari_scalar_loss(cls, target: TensorType, output: TensorType,
                          bound: int) -> TensorType:
        target = MuZeroTFModelV2.scalar_to_categorical(target, bound)
        return cls.cross_entropy(target, output)
Example #13
def atari_model(atari_config: dict):
    env = wrap_atari(gym.make('BreakoutNoFrameskip-v4'))
    return MuZeroTFModelV2(env.observation_space, env.action_space,
                           atari_config)
Example #14
def cartpole_model(cartpole_config):
    env = wrap_cartpole(gym.make('CartPole-v0'))
    return MuZeroTFModelV2(env.observation_space, env.action_space,
                           cartpole_config)
Example #15
    def __init__(self, env: gym.Env, config: Dict[str, Any]):
        self.action_space_size = env.action_space.n
        self.model = MuZeroTFModelV2(env.observation_space, env.action_space,
                                     config)