def dyke_dqn_agent(env: TFPyEnvironment,
                   layers: Optional[List[Layer]] = None) -> DqnAgent:
    """
	Prepares a deep Q-network (DQN) agent for use in the dyke maintenance environment.

	:param env: The dyke environment on which to base the DQN agent.
	:param layers: Optional. A list of layers to supply to the DQN agent's network.
	:return: The agent.
	"""
    layers = fully_connected_dyke_dqn_agent_network(
        sizes=(100, 50)) if layers is None else layers
    # prepare the Q-values layer
    action_as: BoundedArraySpec = from_spec(env.action_spec())
    number_actions: int = int(action_as.maximum - action_as.minimum + 1)
    q_values_layer: Layer = Dense(units=number_actions,
                                  activation=None,
                                  kernel_initializer=RandomUniform(
                                      minval=-3e-3, maxval=3e-3),
                                  bias_initializer=Constant(-2e-1))
    net = Sequential(layers=layers + [q_values_layer])
    # instantiate and return the agent
    optimizer = Adam(learning_rate=1e-3)
    train_step_counter = Variable(initial_value=0)
    return DqnAgent(time_step_spec=env.time_step_spec(),
                    action_spec=env.action_spec(),
                    q_network=net,
                    optimizer=optimizer,
                    epsilon_greedy=0.1,
                    td_errors_loss_fn=element_wise_squared_loss,
                    train_step_counter=train_step_counter)
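
# Hedged usage sketch (not part of the original snippet): `make_dyke_py_environment` is a
# hypothetical constructor for the project's dyke maintenance PyEnvironment; TFPyEnvironment
# and the helper above are assumed to be in scope.
dyke_env = TFPyEnvironment(make_dyke_py_environment())  # hypothetical helper
dyke_agent = dyke_dqn_agent(dyke_env)
dyke_agent.initialize()  # a DqnAgent must be initialized before collecting or training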
Example #2
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="CartPole-v1",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)

    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    actor_net = ActorDistributionNetwork(environment.observation_spec(),
                                         environment.action_spec(),
                                         fc_layer_params=(200, 100))
    value_net = ValueNetwork(environment.observation_spec(),
                             fc_layer_params=(200, 100))
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = PPOClipAgent(  # should be closer to the paper than PPOAgent...
        environment.time_step_spec(),
        environment.action_spec(),
        optimizer=tf.compat.v1.train.AdamOptimizer(),  # the default of None does not work
        actor_net=actor_net,
        value_net=value_net,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        lambda_value=0.5,
        discount_factor=0.95,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(400)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
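
# Minimal invocation sketch (an illustration, not from the original source): only root_dir is
# required; every other argument keeps the defaults declared in the signature above.
if __name__ == "__main__":
    train_eval(root_dir="/tmp/ppo_cartpole_experiment")  # hypothetical output directory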
def test_incorrect_termination_model():
    """
    The generic model-based agent should only allow a ConstantFalseTermination model.
    """

    # setup arguments for the model-based agent constructor
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    network = LinearTransitionNetwork(observation_spec)
    transition_model = KerasTransitionModel([network], observation_spec, action_spec)
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)
    termination_model = MountainCarTermination(observation_spec)
    policy = RandomTFPolicy(time_step_spec, action_spec)

    with pytest.raises(AssertionError) as excinfo:
        ModelBasedAgent(
            time_step_spec,
            action_spec,
            transition_model,
            reward_model,
            termination_model,
            initial_state_distribution_model,
            policy,
            policy,
        )

    assert "Only constant false termination supported" in str(excinfo.value)
Example #4
def test_unknown_transition_model():
    """
    The PETS agent has a prespecified set of transition models; a RuntimeError should be raised for an unknown model.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(
        observation_spec)

    # trajectory optimiser
    trajectory_optimiser_type = TrajectoryOptimizationType.CrossEntropyMethod
    transition_model_type = "unknown_model"
    trajectory_sampler_type = TrajectorySamplerType.TS1

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    learning_rate = 0.9
    max_iterations = 5
    population_size = num_elites + 10
    number_of_particles = 1
    horizon = 1

    with pytest.raises(RuntimeError) as excinfo:
        PetsAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            trajectory_optimiser_type,
            horizon,
            population_size,
            number_of_particles,
            num_elites,
            learning_rate,
            max_iterations,
        )

    assert "Unknown transition model" in str(excinfo.value)
Example #5
def test_ensemble_size_set_correctly():
    """
    For ensemble transition models, the ensemble size needs to be larger than 1.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # transition model and model-free agent
    transition_model_type = TransitionModelType.DeterministicEnsemble
    trajectory_sampler_type = TrajectorySamplerType.TS1
    model_free_agent_type = ModelFreeAgentType.Ppo

    # some parameters need to be set correctly
    ensemble_size = 1
    population_size = 10
    horizon = 1

    # define agent, many transition model and trajectory optimiser parameters can
    # be arbitrary
    with pytest.raises(AssertionError) as excinfo:
        MbpoAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            horizon,
            population_size,
            model_free_agent_type,
            1,
            10,
            tf.nn.relu,
            2,
            1,
        )

    assert "ensemble_size should be > 1" in str(excinfo.value)
Example #6
def test_unknown_transition_model():
    """
    The MBPO agent has a prespecified set of transition models; a RuntimeError should be raised for an unknown model.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # transition model and model-free agent
    transition_model_type = "unknown_model"
    trajectory_sampler_type = TrajectorySamplerType.TS1
    model_free_agent_type = ModelFreeAgentType.Ppo

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    with pytest.raises(RuntimeError) as excinfo:
        MbpoAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            horizon,
            population_size,
            model_free_agent_type,
            1,
            10,
            tf.nn.relu,
            2,
            1,
        )

    assert "Unknown transition model" in str(excinfo.value)
Example #7
def _create_environment_and_policy(batch_size):
    tf_batched_environment = TFPyEnvironment(
        BatchedPyEnvironment([
            PyEnvironmentMock(final_state=TRAJECTORY_LENGTH)
            for _ in range(batch_size)
        ]))
    policy = TFPolicyMock(
        tf_batched_environment.time_step_spec(),
        tf_batched_environment.action_spec(),
        batch_size=batch_size,
    )

    return tf_batched_environment, policy
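
# Illustrative use of the helper above (a sketch, not from the original tests): the mocks
# follow the standard TF-Agents interfaces, so one step of interaction looks like this.
batched_env, mock_policy = _create_environment_and_policy(batch_size=2)
time_step = batched_env.reset()
action_step = mock_policy.action(time_step)
next_time_step = batched_env.step(action_step.action)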
Example #8
def test_planning_policy_batch_environment_model():
    """
    Ensure that the planning policy is operational.
    """

    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                   tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
def generic_dqn_agent(env: TFPyEnvironment) -> (dqn_agent.DqnAgent, q_network.QNetwork):
    """ Function that returns a generic dqn agent
    args:
        env (TFPyEnvironment) : The environment the agent will live in

    Returns:
        dqn_agent.DqnAgent: The agent to train
        q_network.QNetwork: The network used in the agent
    """

    inp = env.observation_spec().shape[0]
    q_net = q_network.QNetwork(
      env.observation_spec(),
      env.action_spec(),
      fc_layer_params=(20,20,20,20,20),
      activation_fn=tf.keras.activations.relu)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

    agent = dqn_agent.DqnAgent(
      env.time_step_spec(),
      env.action_spec(),
      q_network=q_net,
      optimizer=optimizer,
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=tf.Variable(0),
      epsilon_greedy=0.1
    )

    """def observation_and_action_constraint_splitter(observation):
        action_mask = [1,1]
        if observation[0][-1] > 5:
            action_mask[0] = 1
        return observation, tf.convert_to_tensor(action_mask, dtype=np.int32)

    agent.policy._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter
    )"""
    #tf_agents.policies.greedy_policy.GreedyPolicy

    agent.initialize()

    return agent, q_net
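
# Hedged usage sketch (not from the original source): wiring the helper to a standard Gym
# environment; CartPole-v1 is an arbitrary choice with a discrete action space.
from tf_agents.environments import suite_gym

demo_env = TFPyEnvironment(suite_gym.load("CartPole-v1"))
demo_agent, demo_q_net = generic_dqn_agent(demo_env)
demo_action = demo_agent.policy.action(demo_env.reset())  # one greedy action from the policy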
Example #10
conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
fc_layer_params = [512]

q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layer,
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params)

train_step = tf.Variable(0)
update_period = 4 # train the model every 4 steps
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4, rho=0.95, momentum=0.0,
                                     epsilon=0.00001, centered=True)
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0, decay_steps=250000, end_learning_rate=0.01)

agent = DqnAgent(tf_env.time_step_spec(),
                 tf_env.action_spec(),
                 q_network=q_net,
                 optimizer=optimizer,
                 target_update_period=2000,
                 td_errors_loss_fn=keras.losses.Huber(reduction='none'),
                 gamma=0.99,
                 train_step_counter=train_step,
                 epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=850000)
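
# A common follow-up step (a sketch, not part of the original snippet): expose the buffer as a
# tf.data pipeline of adjacent step pairs for DQN training.
dataset = replay_buffer.as_dataset(sample_batch_size=64,
                                   num_steps=2,
                                   num_parallel_calls=3).prefetch(3)
dataset_iterator = iter(dataset)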
Example #11
    initial_learning_rate=0.7,
    decay_steps=total_steps,
    end_learning_rate=0.001,
)

# 3. Constructing the DQN Agent.
optimizer = Yogi(learning_rate=0.00025)
loss = Huber()
n_steps = 3
tau = 0.001
gamma = 0.99
min_q = -200
max_q = 200

agent = CategoricalDqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    categorical_q_network=online_q_net,
    optimizer=optimizer,
    min_q_value=min_q,
    max_q_value=max_q,
    epsilon_greedy=lambda: decay_epsilon_greedy(train_step),
    n_step_update=n_steps,
    target_categorical_q_network=target_q_net,
    target_update_tau=tau,
    target_update_period=1,
    td_errors_loss_fn=loss,
    gamma=gamma,
    train_step_counter=train_step
)
agent.initialize()
    suite_gym.load(env_name,
                   max_episode_steps=max_episode_steps_eval,
                   gym_env_wrappers=[ShrinkWrapper, DiscreteActionWrapper]))

# create DQN (deep Q-Learning network)
q_net = QNetwork(train_env.observation_spec(),
                 train_env.action_spec(),
                 conv_layer_params=conv_layer_params,
                 fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.compat.v2.Variable(0)

# create deep reinforcement learning agent
tf_agent = DqnAgent(train_env.time_step_spec(),
                    train_env.action_spec(),
                    q_network=q_net,
                    optimizer=optimizer,
                    td_errors_loss_fn=element_wise_squared_loss,
                    train_step_counter=train_step_counter)
tf_agent.initialize()

# create evaluation and data collection policies
eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy

# create replay buffer
print("Creating replay buffer")
replay_buffer = TFUniformReplayBuffer(data_spec=tf_agent.collect_data_spec,
                                      batch_size=train_env.batch_size,
Example #13
class DQNAgent:
    def __init__(self) -> None:
        """
        A class for training a TF-agent
        based on https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
        """

        self.train_env = None  # Training environment
        self.agent = None  # The algorithm used to solve an RL problem is represented by a TF-Agent
        self.replay_buffer = None  # The replay buffer keeps track of data collected from the environment
        self.dataset = None  # The agent needs access to the replay buffer via an iterable tf.data.Dataset
        self.iterator = None  # The iterator of self.dataset

    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float, imb_ratio: float,
                replay_buffer_max_length: int, layers: dict) -> None:
        """
        Create the Q-network, agent and policy

        Args:
            X_train: A np.ndarray for training samples.
            y_train: A np.ndarray for the class labels of the training samples.
            lr: Learning rate for the optimizer (default Adam).
            epsilon: Used for the default epsilon-greedy policy for choosing a random action.
            gamma: The discount factor for learning Q-values.
            imb_ratio: Ratio of imbalance. Used to specify the reward in the environment.
            replay_buffer_max_length: Maximum length of the replay memory.
            layers: A dict containing the layers of the Q-network (e.g., conv, dense, rnn, dropout).
        """

        dense_layers = layers.get("dense")
        conv_layers = layers.get("conv")
        dropout_layers = layers.get("dropout")

        self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

        q_net = QNetwork(self.train_env.observation_spec(), self.train_env.action_spec(), conv_layer_params=conv_layers,
                         fc_layer_params=dense_layers, dropout_layer_params=dropout_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)

        train_step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=train_step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )

        self.agent.initialize()

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=replay_buffer_max_length)

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, epochs: int, batch_size: int, eval_step: int, log_step: int,
            collect_steps_per_episode: int) -> None:
        """
        Starts the training of the Agent.

        Args:
            X_train: A np.ndarray for training samples.
            y_train: A np.ndarray for the class labels of the training samples.
            epochs: Number of epochs to train the agent.
            batch_size: The batch size used when sampling from the replay buffer.
            eval_step: Evaluate the model every `eval_step` steps.
            log_step: Log the model's results every `log_step` steps.
            collect_steps_per_episode: Number of steps to collect with `collect_policy` and save to the replay buffer each epoch.
        """

        self.dataset = self.replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2).prefetch(3)

        self.iterator = iter(self.dataset)

        def collect_step(environment, policy, buffer):
            time_step = environment.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = environment.step(action_step.action)
            traj = trajectory.from_transition(time_step, action_step, next_time_step)

            # Add trajectory to the replay buffer
            buffer.add_batch(traj)

        def collect_data(env, policy, buffer, steps):
            for _ in range(steps):
                collect_step(env, policy, buffer)

        # (Optional) Optimize by wrapping some of the code in a graph using TF function.
        self.agent.train = common.function(self.agent.train)

        # Reset the train step
        self.agent.train_step_counter.assign(0)

        for _ in range(epochs):
            #print("epoch: ", _)
            # Collect a few steps using collect_policy and save to the replay buffer.
            collect_data(self.train_env, self.agent.collect_policy, self.replay_buffer, collect_steps_per_episode)

            # Sample a batch of data from the buffer and update the agent's network.
            experience, _ = next(self.iterator)
            train_loss = self.agent.train(experience).loss

            step = self.agent.train_step_counter.numpy()

            if step % log_step == 0:
                print('step = {0}: loss = {1}'.format(step, train_loss))

            if step % eval_step == 0:
                metrics = self.compute_metrics(X_train, y_train)
                print(metrics)

    def compute_metrics(self, X: np.ndarray, y_true: list) -> dict:
        """Compute Metrics for Evaluation"""
        # TODO: apply softmax layer for q logits?

        q, _ = self.agent._target_q_network(X, training=False)

        # y_scores = np.max(q.numpy(), axis=1)  # predicted scores (Q-Values)
        y_pred = np.argmax(q.numpy(), axis=1)  # predicted class label

        metrics = custom_metrics(y_true, y_pred)

        return metrics

    def evaluate(self, X: np.ndarray, y: list, X_train=None, y_train=None) -> dict:
        """
         Evaluation of trained Q-network
        """
        metrics = self.compute_metrics(X, y)

        print("evaluation: ", metrics)
        return metrics
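
# Hedged usage sketch (not from the original source): the feature matrix and binary labels
# below are synthetic stand-ins; ClassifyEnv and custom_metrics come from the surrounding
# project, so this only illustrates the intended call order.
import numpy as np

X_demo = np.random.rand(1000, 20).astype(np.float32)
y_demo = np.random.randint(0, 2, size=1000)

dqn = DQNAgent()
dqn.compile(X_demo, y_demo, lr=1e-3, epsilon=0.1, gamma=0.99, imb_ratio=0.2,
            replay_buffer_max_length=10_000,
            layers={"dense": (64, 32), "conv": None, "dropout": None})
dqn.fit(X_demo, y_demo, epochs=200, batch_size=64,
        eval_step=50, log_step=10, collect_steps_per_episode=100)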
def initialize_tf_agent(model_class: ABCMeta,
                        train_env: TFPyEnvironment) -> TFAgent:
    optimizer = Adam(learning_rate=1e-3)

    if model_class in [agents.PPOAgent]:
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            train_env.observation_spec(),
            train_env.action_spec(),
            fc_layer_params=(200, 100),
            activation_fn=tf.keras.activations.tanh,
        )
        value_net = value_network.ValueNetwork(
            train_env.observation_spec(),
            fc_layer_params=(200, 100),
            activation_fn=tf.keras.activations.tanh,
        )
        model = model_class(
            time_step_spec=train_env.time_step_spec(),
            action_spec=train_env.action_spec(),
            actor_net=actor_net,
            value_net=value_net,
            optimizer=optimizer,
        )
    elif model_class in [agents.DqnAgent]:
        action_spec = train_env.action_spec()
        num_actions = action_spec.maximum - action_spec.minimum + 1
        q_network = create_feedforward_network(fc_layer_units=(100, ),
                                               num_actions=num_actions)
        model = model_class(
            time_step_spec=train_env.time_step_spec(),
            action_spec=train_env.action_spec(),
            q_network=q_network,
            optimizer=optimizer,
        )
    elif model_class in [agents.ReinforceAgent]:
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            train_env.time_step_spec().observation,
            train_env.action_spec(),
            fc_layer_params=(100, ))
        model = model_class(
            time_step_spec=train_env.time_step_spec(),
            action_spec=train_env.action_spec(),
            actor_network=actor_net,
            optimizer=optimizer,
        )
    elif model_class in [agents.SacAgent]:
        time_step_spec = train_env.time_step_spec()
        observation_spec = time_step_spec.observation
        action_spec = train_env.action_spec()
        critic_joint_fc_layers = (256, 256)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=(256, 256),
            continuous_projection_net=TanhNormalProjectionNetwork,
        )
        critic_net = critic_network.CriticNetwork(
            (observation_spec, action_spec),
            joint_fc_layer_params=critic_joint_fc_layers,
            kernel_initializer="glorot_uniform",
            last_kernel_initializer="glorot_uniform",
        )
        model = agents.SacAgent(
            time_step_spec,
            action_spec,
            actor_network=actor_net,
            critic_network=critic_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(3e-4),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(3e-4),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(3e-4),
        )
    else:
        raise ValueError(
            f"Agent class `{model_class.__name__}` is not supported")
    model.initialize()
    return model
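
# Illustrative call (a sketch assuming `agents` is the tf_agents.agents module referenced in
# the class checks above, and that the project helper create_feedforward_network is importable).
from tf_agents.environments import suite_gym

demo_train_env = TFPyEnvironment(suite_gym.load("CartPole-v1"))
demo_model = initialize_tf_agent(model_class=agents.DqnAgent, train_env=demo_train_env)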
def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the Deep Q-Network
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)

    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    optimizer = keras.optimizers.RMSprop(
        learning_rate=2.5e-4, rho=0.95, momentum=0.0, epsilon=0.00001, centered=True)

    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε

    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] +
                                       train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer,
                   ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )

    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=eval_tf_env.batch_size, buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")
Example #16
if __name__ == '__main__':
    # Create global step counter
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Create a dummy environment with no policy, just to extract the specs
    dummy_env = TFPyEnvironment(NineMensMorris(None, discount=DISCOUNT))

    # Create Q Network
    q_net = QNetwork(input_tensor_spec=dummy_env.observation_spec(),
                     action_spec=dummy_env.action_spec(),
                     fc_layer_params=(100, 600, 600, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 600, 600),
                     dropout_layer_params=(None, 0.1, 0.1, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.1, None))

    # Create agent
    agent = DdqnAgent(time_step_spec=dummy_env.time_step_spec(),
                      action_spec=dummy_env.action_spec(),
                      q_network=q_net,
                      optimizer=Adam(learning_rate=1e-4),
                      td_errors_loss_fn=common.element_wise_squared_loss,
                      epsilon_greedy=0.1,
                      train_step_counter=global_step)
    # Initialize agent
    agent.initialize()
    # Wrap the training function in a TF graph
    agent.train = common.function(agent.train)

    # Create game environments: training and evaluation
    train_env = TFPyEnvironment(NineMensMorris(agent.policy, discount=DISCOUNT))
    eval_env = TFPyEnvironment(NineMensMorris(agent.policy, discount=DISCOUNT))
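
    # Hedged continuation sketch (not in the original snippet): the usual next step is a replay
    # buffer fed by a driver running the agent's collect policy on train_env. TFUniformReplayBuffer
    # and DynamicStepDriver are assumed to be imported as in the other snippets on this page.
    replay_buffer = TFUniformReplayBuffer(data_spec=agent.collect_data_spec,
                                          batch_size=train_env.batch_size,
                                          max_length=100_000)
    collect_driver = DynamicStepDriver(train_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer.add_batch],
                                       num_steps=1)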
Example #17
class TrainDDQN():
    """Wrapper for DDQN training, validation, saving etc."""
    def __init__(self,
                 episodes: int,
                 warmup_steps: int,
                 learning_rate: float,
                 gamma: float,
                 min_epsilon: float,
                 decay_episodes: int,
                 model_path: str = None,
                 log_dir: str = None,
                 batch_size: int = 64,
                 memory_length: int = None,
                 collect_steps_per_episode: int = 1,
                 val_every: int = None,
                 target_update_period: int = 1,
                 target_update_tau: float = 1.0,
                 progressbar: bool = True,
                 n_step_update: int = 1,
                 gradient_clipping: float = 1.0,
                 collect_every: int = 1) -> None:
        """
        Wrapper to make training easier.
        Code is partly based on https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial

        :param episodes: Number of training episodes
        :type  episodes: int
        :param warmup_steps: Number of steps to fill the Replay Buffer with random state-action pairs before training starts
        :type  warmup_steps: int
        :param learning_rate: Learning Rate for the Adam Optimizer
        :type  learning_rate: float
        :param gamma: Discount factor for the Q-values
        :type  gamma: float
        :param min_epsilon: Lowest and final value for epsilon
        :type  min_epsilon: float
        :param decay_episodes: Number of episodes over which epsilon decays from 1 to `min_epsilon`
        :type  decay_episodes: int
        :param model_path: Location to save the trained model
        :type  model_path: str
        :param log_dir: Location to save the logs, useful for TensorBoard
        :type  log_dir: str
        :param batch_size: Number of samples in minibatch to train on each step
        :type  batch_size: int
        :param memory_length: Maximum size of the Replay Buffer
        :type  memory_length: int
        :param collect_steps_per_episode: Number of steps of data to collect for the Replay Buffer each episode
        :type  collect_steps_per_episode: int
        :param collect_every: Step interval to collect data during training
        :type  collect_every: int
        :param val_every: Validate the model every X episodes using the `collect_metrics()` function
        :type  val_every: int
        :param target_update_period: Update the target Q-network every X episodes
        :type  target_update_period: int
        :param target_update_tau: Parameter for softening the `target_update_period`
        :type  target_update_tau: float
        :param progressbar: Enable or disable the progressbar for collecting data and training
        :type  progressbar: bool

        :return: None
        :rtype: NoneType
        """
        self.episodes = episodes  # Total episodes
        self.warmup_steps = warmup_steps  # Amount of warmup steps before training
        self.batch_size = batch_size  # Batch size of Replay Memory
        self.collect_steps_per_episode = collect_steps_per_episode  # Amount of steps to collect data each episode
        self.collect_every = collect_every  # Step interval to collect data during training
        self.learning_rate = learning_rate  # Learning Rate
        self.gamma = gamma  # Discount factor
        self.min_epsilon = min_epsilon  # Minimal chance of choosing random action
        self.decay_episodes = decay_episodes  # Number of episodes to decay from 1.0 to `EPSILON`
        self.target_update_period = target_update_period  # Period for soft updates
        self.target_update_tau = target_update_tau
        self.progressbar = progressbar  # Enable or disable the progressbar for collecting data and training
        self.n_step_update = n_step_update
        self.gradient_clipping = gradient_clipping  # Clip the loss
        self.compiled = False
        NOW = datetime.now().strftime("%Y%m%d_%H%M%S")

        if memory_length is not None:
            self.memory_length = memory_length  # Max Replay Memory length
        else:
            self.memory_length = warmup_steps

        if val_every is not None:
            self.val_every = val_every  # Validate the policy every `val_every` episodes
        else:
            self.val_every = self.episodes // min(
                50, self.episodes
            )  # Can't validate the model 50 times if self.episodes < 50

        if model_path is not None:
            self.model_path = model_path
        else:
            self.model_path = "./models/" + NOW + ".pkl"

        if log_dir is None:
            log_dir = "./logs/" + NOW
        self.writer = tf.summary.create_file_writer(log_dir)

    def compile_model(self,
                      X_train,
                      y_train,
                      layers: list = [],
                      imb_ratio: float = None,
                      loss_fn=common.element_wise_squared_loss) -> None:
        """Initializes the neural networks, DDQN-agent, collect policies and replay buffer.

        :param X_train: Training data for the model.
        :type  X_train: np.ndarray
        :param y_train: Labels corresponding to `X_train`.  1 for the positive class, 0 for the negative class.
        :type  y_train: np.ndarray
        :param layers: List of layers to feed into the TF-agents custom Sequential(!) layer.
        :type  layers: list
        :param imb_ratio: The imbalance ratio of the data.
        :type  imb_ratio: float
        :param loss_fn: Callable loss function
        :type  loss_fn: tf.compat.v1.losses

        :return: None
        :rtype: NoneType
        """
        if imb_ratio is None:
            imb_ratio = imbalance_ratio(y_train)

        self.train_env = TFPyEnvironment(
            ClassifierEnv(X_train, y_train, imb_ratio))
        self.global_episode = tf.Variable(
            0, name="global_episode", dtype=np.int64,
            trainable=False)  # Global train episode counter

        # Custom epsilon decay: https://github.com/tensorflow/agents/issues/339
        epsilon_decay = tf.compat.v1.train.polynomial_decay(
            1.0,
            self.global_episode,
            self.decay_episodes,
            end_learning_rate=self.min_epsilon)

        self.q_net = Sequential(layers, self.train_env.observation_spec())

        self.agent = DdqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=self.q_net,
            optimizer=Adam(learning_rate=self.learning_rate),
            td_errors_loss_fn=loss_fn,
            train_step_counter=self.global_episode,
            target_update_period=self.target_update_period,
            target_update_tau=self.target_update_tau,
            gamma=self.gamma,
            epsilon_greedy=epsilon_decay,
            n_step_update=self.n_step_update,
            gradient_clipping=self.gradient_clipping)
        self.agent.initialize()

        self.random_policy = RandomTFPolicy(self.train_env.time_step_spec(),
                                            self.train_env.action_spec())
        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=self.memory_length)

        self.warmup_driver = DynamicStepDriver(
            self.train_env,
            self.random_policy,
            observers=[self.replay_buffer.add_batch],
            num_steps=self.warmup_steps)  # Uses a random policy

        self.collect_driver = DynamicStepDriver(
            self.train_env,
            self.agent.collect_policy,
            observers=[self.replay_buffer.add_batch],
            num_steps=self.collect_steps_per_episode
        )  # Uses the epsilon-greedy policy of the agent

        self.agent.train = common.function(self.agent.train)  # Optimization
        self.warmup_driver.run = common.function(self.warmup_driver.run)
        self.collect_driver.run = common.function(self.collect_driver.run)

        self.compiled = True

    def train(self, *args) -> None:
        """Starts the training of the model. Includes warmup period, metrics collection and model saving.

        :param *args: All arguments will be passed to `collect_metrics()`.
            This can be useful for passing callables, testing environments or validation data.
            Override the `TrainDDQN.collect_metrics()` function to use your own *args.
        :type  *args: Any

        :return: None
        :rtype: NoneType, last step is saving the model as a side-effect
        """
        assert self.compiled, "Model must be compiled with model.compile_model(X_train, y_train, layers) before training."

        # Warmup period, fill memory with random actions
        if self.progressbar:
            print(
                f"\033[92mCollecting data for {self.warmup_steps:_} steps... This might take a few minutes...\033[0m"
            )

        self.warmup_driver.run(
            time_step=None,
            policy_state=self.random_policy.get_initial_state(
                self.train_env.batch_size))

        if self.progressbar:
            print(
                f"\033[92m{self.replay_buffer.num_frames():_} frames collected!\033[0m"
            )

        dataset = self.replay_buffer.as_dataset(
            sample_batch_size=self.batch_size,
            num_steps=self.n_step_update + 1,
            num_parallel_calls=data.experimental.AUTOTUNE).prefetch(
                data.experimental.AUTOTUNE)
        iterator = iter(dataset)

        def _train():
            experiences, _ = next(iterator)
            return self.agent.train(experiences).loss

        _train = common.function(_train)  # Optimization

        ts = None
        policy_state = self.agent.collect_policy.get_initial_state(
            self.train_env.batch_size)
        self.collect_metrics(*args)  # Initial collection for step 0
        pbar = tqdm(total=self.episodes,
                    disable=(not self.progressbar),
                    desc="Training the DDQN")  # TQDM progressbar
        for _ in range(self.episodes):
            if not self.global_episode % self.collect_every:
                # Collect a few steps using collect_policy and save to `replay_buffer`
                if self.collect_steps_per_episode != 0:
                    ts, policy_state = self.collect_driver.run(
                        time_step=ts, policy_state=policy_state)
                pbar.update(
                    self.collect_every
                )  # More stable TQDM updates, collecting could take some time

            # Sample a batch of data from `replay_buffer` and update the agent's network
            train_loss = _train()

            if not self.global_episode % self.val_every:
                with self.writer.as_default():
                    tf.summary.scalar("train_loss",
                                      train_loss,
                                      step=self.global_episode)

                self.collect_metrics(*args)
        pbar.close()

    def collect_metrics(self,
                        X_val: np.ndarray,
                        y_val: np.ndarray,
                        save_best: str = None):
        """Collects metrics using the trained Q-network.

        :param X_val: Features of validation data, same shape as X_train
        :type  X_val: np.ndarray
        :param y_val: Labels of validation data, same shape as y_train
        :type  y_val: np.ndarray
        :param save_best: Saving the best model of all validation runs based on given metric:
            Choose one of: {Gmean, F1, Precision, Recall, TP, TN, FP, FN}
            This improves stability since the model at the last episode is not guaranteed to be the best model.
        :type  save_best: str
        """
        y_pred = network_predictions(self.agent._target_q_network, X_val)
        stats = classification_metrics(y_val, y_pred)
        avgQ = np.mean(decision_function(self.agent._target_q_network,
                                         X_val))  # Max action for each x in X

        if save_best is not None:
            if not hasattr(self, "best_score"):  # If no best model yet
                self.best_score = 0.0

            if stats.get(save_best) >= self.best_score:  # Overwrite best model
                self.save_network(
                )  # Saving directly to avoid shallow copy without trained weights
                self.best_score = stats.get(save_best)

        with self.writer.as_default():
            tf.summary.scalar(
                "AverageQ", avgQ,
                step=self.global_episode)  # Average Q-value for this epoch
            for k, v in stats.items():
                tf.summary.scalar(k, v, step=self.global_episode)

    def evaluate(self, X_test, y_test, X_train=None, y_train=None):
        """
        Final evaluation of trained Q-network with X_test and y_test.
        Optional PR and ROC curve comparison to X_train, y_train to ensure no overfitting is taking place.

        :param X_test: Features of test data, same shape as X_train
        :type  X_test: np.ndarray
        :param y_test: Labels of test data, same shape as y_train
        :type  y_test: np.ndarray
        :param X_train: Features of train data
        :type  X_train: np.ndarray
        :param y_train: Labels of train data
        :type  y_train: np.ndarray
        """
        if hasattr(self, "best_score"):
            print(f"\033[92mBest score: {self.best_score:6f}!\033[0m")
            network = self.load_network(
                self.model_path)  # Load best saved model
        else:
            network = self.agent._target_q_network  # Load latest target model

        if (X_train is not None) and (y_train is not None):
            plot_pr_curve(network, X_test, y_test, X_train, y_train)
            plot_roc_curve(network, X_test, y_test, X_train, y_train)

        y_pred = network_predictions(network, X_test)
        return classification_metrics(y_test, y_pred)

    def save_network(self):
        """Saves Q-network as pickle to `model_path`."""
        with open(self.model_path, "wb") as f:  # Save Q-network as pickle
            pickle.dump(self.agent._target_q_network, f)

    @staticmethod
    def load_network(fp: str):
        """Static method to load Q-network pickle from given filepath.

        :param fp: Filepath to the saved pickle of the network
        :type  fp: str

        :returns: The network-object loaded from a pickle file.
        :rtype: tensorflow.keras.models.Model
        """
        with open(fp, "rb") as f:  # Load the Q-network
            network = pickle.load(f)
        return network
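
# Hedged usage sketch (not from the original source): X_train/y_train/X_val/y_val/X_test/y_test
# are assumed NumPy arrays from the surrounding project; the two dense layers form an arbitrary
# Q-network for flat features with two actions.
model = TrainDDQN(episodes=5_000, warmup_steps=10_000, learning_rate=1e-3, gamma=0.99,
                  min_epsilon=0.05, decay_episodes=2_500, batch_size=64, val_every=250)
model.compile_model(X_train, y_train,
                    layers=[tf.keras.layers.Dense(64, activation="relu"),
                            tf.keras.layers.Dense(2)])
model.train(X_val, y_val)                 # *args are forwarded to collect_metrics()
results = model.evaluate(X_test, y_test)  # final metrics on held-out data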
    initial_learning_rate=0.9,
    decay_steps=total_steps,
    end_learning_rate=0.001,
)

# 3. Constructing the DQN Agent.
optimizer = Yogi(learning_rate=0.00025)
loss = Huber()
n_steps = 3
tau = 0.001
gamma = 0.99
min_q = -200
max_q = 200

agent = CategoricalDqnAgent(
    time_step_spec=eval_env.time_step_spec(),
    action_spec=eval_env.action_spec(),
    categorical_q_network=online_q_net,
    optimizer=optimizer,
    min_q_value=min_q,
    max_q_value=max_q,
    epsilon_greedy=lambda: decay_epsilon_greedy(train_step),
    n_step_update=n_steps,
    target_categorical_q_network=target_q_net,
    target_update_tau=tau,
    target_update_period=1,
    td_errors_loss_fn=loss,
    gamma=gamma,
    train_step_counter=train_step)
agent.initialize()
Example #19
def create_reinforce_agent(
        env: TFPyEnvironment,
        gamma: float = 0.99,
        agent_name: str = 'reinforce_agent',
        debug: bool = False,
        training_step_counter: Optional[Any] = None,
        agent_params: Optional[Dict[str, Any]] = None) -> ReinforceAgent:
    """
    Function for creating a REINFORCE agent in line with the TensorFlow Agents implementation.
    This function builds an action network and uses this to instantiate the agent which is returned.

    :param env: TensorFlow Environment implementing the ControlledRandomWalk.
    :param gamma: Discount factor.
    :param agent_name: Name for the agent to aid in identifying TensorFlow variables etc. when
        debugging.
    :param debug: Flag which toggles debugging in the REINFORCE agent.
    :param training_step_counter: An optional counter to increment every time the train op of the
        agent is run. If None is provided, it defaults to the global_step.
    :param agent_params: A dictionary of possible overrides for the default TF-Agents agent set up.
    :return: An instance of TensorFlow Agents REINFORCE agent.
    """
    # Process the action specification to attain the dimensions of the action subspaces to ensure
    # that in the case that there is only one resource set (and therefore only one action subspace)
    # the tuple of action specifications of length one is replaced by a single action specification.
    # This is to align with the fact that the actor network is implemented to return a tuple of
    # (OneHotCategorical) distributions (one for each resource set) where there are multiple action
    # subspaces and a single distribution (tfp.distributions.OneHotCategorical) otherwise.
    # First attain the action spec.
    action_spec = env.action_spec()

    # Extract the shape of the subspaces from the action specification tuple.
    # Action spaces are defined with shape (1, num_actions_for_resource_set) so take the -1th entry.
    action_subspace_dimensions = tuple(
        int(subspace.shape[-1]) for subspace in action_spec)

    # Then test if there is only one action subspace.
    if len(action_spec) == 1:
        # Pull out the only action spec.
        action_spec = action_spec[0]

    if agent_params is None:
        agent_params = dict()

    # Set up the action network. See `multi_headed_softmax_policy.py` for details.
    actor_network = MultiHeadedCategoricalActionNetwork(
        input_tensor_spec=env.observation_spec(),
        output_tensor_spec=action_spec,
        action_subspace_dimensions=action_subspace_dimensions,
        hidden_units=agent_params.get('hidden_units', (64, )))
    # Set up the REINFORCE agent in line with standard tf_agents.
    agent = ReinforceAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=action_spec,
        actor_network=actor_network,
        optimizer=tf.compat.v1.train.AdamOptimizer(),
        value_network=agent_params.get('value_network', None),
        value_estimation_loss_coef=agent_params.get(
            'value_estimation_loss_coef', 0.2),
        advantage_fn=agent_params.get('advantage_fn', None),
        use_advantage_loss=agent_params.get('use_advantage_loss', True),
        gamma=gamma,
        normalize_returns=agent_params.get('normalize_returns', True),
        gradient_clipping=agent_params.get('gradient_clipping', None),
        debug_summaries=debug,
        summarize_grads_and_vars=debug,
        entropy_regularization=agent_params.get('entropy_regularization', None),
        train_step_counter=training_step_counter,
        name=agent_name)

    return agent
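
# Hypothetical wiring sketch (not from the original source): `make_controlled_random_walk_env`
# stands in for the project's ControlledRandomWalk PyEnvironment constructor mentioned in the
# docstring above.
crw_env = TFPyEnvironment(make_controlled_random_walk_env())  # hypothetical helper
reinforce_agent = create_reinforce_agent(crw_env,
                                         gamma=0.99,
                                         agent_params={'hidden_units': (128, 64)})
reinforce_agent.initialize()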
Example #20
def create_bellman_pets_agent(
        env: TFPyEnvironment,
        agent_name: str = 'PETS_Agent',
        debug: bool = False,  # REQUIRED?
        reward_model_class: RewardModel = None,
        initial_state_distribution_model_class:
    InitialStateDistributionModel = None,
        training_step_counter: Optional[Any] = None,
        agent_params: Optional[Dict[str, Any]] = None) -> PetsAgent:
    """
    Function for creating a Bellman PETS agent in line with the Bellman
    implementation.
    This function builds an action network and uses this to instantiate the agent which is returned.

    :param env: TensorFlow Environment implementing the ControlledRandomWalk.
    :param agent_name: Name for the agent to aid in identifying TensorFlow variables etc. when
        debugging.
    :param debug: Flag which toggles debugging in the PETS agent.
    :param reward_model_class: CRWRewardModel, dummy variable, currently extracted from env
    :param initial_state_distribution_model_class: CRWStateInitialiser, dummy variable, currently extracted from env
    :param training_step_counter: An optional counter to increment every time the train op of the
        agent is run. If None is provided, it defaults to the global_step.
    :param agent_params: A dictionary of possible overrides for the default TF-Agents agent set up.
    :return: An instance of Bellman PETS agent.
    """
    # Process the action specification to attain the dimensions of the action subspaces to ensure
    # that in the case that there is only one resource set (and therefore only one action subspace)
    # the tuple of action specifications of length one is replaced by a single action specification.
    # This is to align with the fact that the actor network is implemented to return a tuple of
    # (OneHotCategorical) distributions (one for each resource set) where there are multiple action
    # subspaces and a single distribution (tfp.distributions.OneHotCategorical) otherwise.
    # First attain the action spec.
    # action_spec = env.action_spec()

    # Extract the shape of the subspaces from the action specification tuple.
    # Action spaces are defined with shape (1, num_actions_for_resource_set) so take the -1th entry.
    # action_subspace_dimensions = tuple(int(subspace.shape[-1]) for subspace in action_spec)

    # # Then test if there is only one action subspace.
    # if len(action_spec) == 1:
    #     # Pull out the only action spec.
    #     action_spec = action_spec[0]

    if agent_params is None:
        agent_params = dict()

    callbacks = [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)]

    # initializing given MDP components
    # NOTE: hacked for the time being, using quantities directly from the environment for now
    reward_model = reward_model_class(env.observation_spec(),
                                      env.action_spec(), env)
    initial_state_distribution_model = initial_state_distribution_model_class(
        env)

    # Set up the PETS agent in line with Bellman toolbox
    agent = PetsAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=env.action_spec(),
        transition_model_type=agent_params.get(
            'transition_model_type',
            TransitionModelType.DeterministicEnsemble),
        num_hidden_layers=agent_params.get('num_hidden_layers', 3),
        num_hidden_nodes=agent_params.get('num_hidden_nodes', 250),
        activation_function=agent_params.get('activation_function',
                                             tf.nn.relu),
        ensemble_size=agent_params.get('ensemble_size', 5),
        predict_state_difference=agent_params.get('predict_state_difference',
                                                  True),
        epochs=agent_params.get('epochs', 100),
        training_batch_size=agent_params.get('training_batch_size', 32),
        callbacks=agent_params.get('callbacks', callbacks),
        reward_model=reward_model,
        initial_state_distribution_model=initial_state_distribution_model,
        trajectory_sampler_type=agent_params.get('trajectory_sampler_type',
                                                TrajectorySamplerType.TS1),
        trajectory_optimization_type=agent_params.get(
            'trajectory_optimization_type',
            TrajectoryOptimizationType.RandomShooting),
        horizon=agent_params.get('horizon', 25),
        population_size=agent_params.get('population_size', 2500),
        number_of_particles=agent_params.get('number_of_particles', 1),
        num_elites=agent_params.get('num_elites', 40),
        learning_rate=agent_params.get('learning_rate', 0.9),
        max_iterations=agent_params.get('max_iterations', 5),
        train_step_counter=training_step_counter,
    )

    return agent
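
# Usage sketch, assuming tf_env is a TFPyEnvironment wrapping the ControlledRandomWalk and that
# CRWRewardModel and CRWStateInitialiser (the classes referenced in the docstring above) are
# importable; the agent_params keys mirror the .get(...) defaults used in the function.
pets_agent = create_bellman_pets_agent(
    env=tf_env,
    reward_model_class=CRWRewardModel,
    initial_state_distribution_model_class=CRWStateInitialiser,
    agent_params={'horizon': 10, 'population_size': 500, 'num_elites': 40},
)
pets_agent.initialize()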
Exemple #21
0
def create_ppo_agent(
        env: TFPyEnvironment,
        num_epochs: int = 10,
        gamma: float = 0.99,
        agent_name: str = 'PPO_Agent',
        debug: bool = False,
        training_step_counter: Optional[Any] = None,
        agent_params: Optional[Dict[str, Any]] = None) -> PPOAgent:
    """
    Function for creating a Proximal Policy Optimisation agent in line with the TensorFlow Agents
    implementation.
    This function builds an action network (and a default value network) and uses these to
    instantiate the agent, which is returned.

    :param env: TensorFlow Environment implementing the ControlledRandomWalk.
    :param num_epochs: Number of epochs for computing policy updates.
    :param gamma: Discount factor.
    :param agent_name: Name for the agent to aid in identifying TensorFlow variables etc. when
        debugging.
    :param debug: Flag which toggles debugging in the PPO agent.
    :param training_step_counter: An optional counter to increment every time the train op of the
        agent is run. If None is provided it defaults to the global_step.
    :param agent_params: A dictionary of possible overrides for the default TF-Agents agent set up.
    :return: An instance of TensorFlow Agents PPO agent.
    """
    # Process the action specification to attain the dimensions of the action subspaces to ensure
    # that in the case that there is only one resource set (and therefore only one action subspace)
    # the tuple of action specifications of length one is replaced by a single action specification.
    # This is to align with the fact that the actor network is implemented to return a tuple of
    # (OneHotCategorical) distributions (one for each resource set) where there are multiple action
    # subspaces and a single distribution (tfp.distributions.OneHotCategorical) otherwise.
    # First attain the action spec.
    action_spec = env.action_spec()

    # Extract the shape of the subspaces from the action specification tuple.
    # Action spaces are defined with shape (1, num_actions_for_resource_set), so take the last
    # (index -1) entry.
    action_subspace_dimensions = tuple(
        int(subspace.shape[-1]) for subspace in action_spec)

    # Then test if there is only one action subspace.
    if len(action_spec) == 1:
        # Pull out the only action spec.
        action_spec = action_spec[0]

    if agent_params is None:
        agent_params = dict()

    # Set up the action network. See `multi_headed_softmax_policy.py` for details.
    actor_network = MultiHeadedCategoricalActionNetwork(
        input_tensor_spec=env.observation_spec(),
        output_tensor_spec=action_spec,
        action_subspace_dimensions=action_subspace_dimensions,
        hidden_units=agent_params.get('hidden_units', (64, )))

    # PPO Requires a value network, we set one up using the default tf_agents set up.
    value_network = tf_agents.networks.value_network.ValueNetwork(
        env.observation_spec(),
        fc_layer_params=agent_params.get('value_fc_layer_params', (128, 64)),
        activation_fn=agent_params.get('value_net_activation_fn', tf.nn.tanh))

    # Set up the PPO agent in line with standard tf_agents.
    agent = PPOAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=action_spec,
        actor_net=actor_network,
        optimizer=tf.compat.v1.train.AdamOptimizer(
            agent_params.get('learning_rate', 0.001)),
        value_net=value_network,
        importance_ratio_clipping=agent_params.get('importance_ratio_clipping',
                                                   0.0),
        lambda_value=agent_params.get('lambda_value', 0.95),
        discount_factor=gamma,
        policy_l2_reg=agent_params.get('policy_l2_reg', 0.0),
        value_function_l2_reg=agent_params.get('value_function_l2_reg', 0.0),
        value_pred_loss_coef=agent_params.get('value_pred_loss_coef', 0.5),
        num_epochs=num_epochs,
        use_gae=agent_params.get('use_gae', False),
        use_td_lambda_return=agent_params.get('use_td_lambda_return', False),
        normalize_rewards=agent_params.get('normalize_rewards', True),
        reward_norm_clipping=agent_params.get('reward_norm_clipping', 10),
        kl_cutoff_factor=agent_params.get('kl_cutoff_factor', 2.0),
        kl_cutoff_coef=agent_params.get('kl_cutoff_coef', 1000),
        initial_adaptive_kl_beta=agent_params.get('initial_adaptive_kl_beta',
                                                  1.0),
        adaptive_kl_target=agent_params.get('adaptive_kl_target', 0.01),
        adaptive_kl_tolerance=agent_params.get('adaptive_kl_tolerance', 0.3),
        normalize_observations=agent_params.get('normalize_observations',
                                                True),
        gradient_clipping=agent_params.get('gradient_clipping', None),
        debug_summaries=debug,
        summarize_grads_and_vars=debug,
        check_numerics=agent_params.get('check_numerics', False),
        entropy_regularization=agent_params.get('entropy_regularization', 0.0),
        train_step_counter=training_step_counter,
        name=agent_name)

    return agent
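
# Usage sketch, assuming tf_env is a TFPyEnvironment wrapping the ControlledRandomWalk; the
# agent_params keys mirror the .get(...) defaults used in the function above.
ppo_agent = create_ppo_agent(
    env=tf_env,
    num_epochs=10,
    gamma=0.99,
    agent_params={'learning_rate': 3e-4, 'use_gae': True},
)
ppo_agent.initialize()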
Exemple #22
0
target_update_period = 5
learning_rate = 1e-3
n_step_update = 1
gamma = 0.99
gradient_clipping = None
reward_scale_factor = 1.0
debug_summaries = False
summarize_grads_and_vars = False

q_net = q_network.QNetwork(
    tf_env.observation_spec(), tf_env.action_spec(), fc_layer_params=fc_layer_params
)


tf_agent = dqn_agent.DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    epsilon_greedy=epsilon_greedy,
    n_step_update=n_step_update,
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate),
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    gradient_clipping=gradient_clipping,
    debug_summaries=debug_summaries,
    summarize_grads_and_vars=summarize_grads_and_vars,
    train_step_counter=global_step,
)
# see TF-agents issue #113
# optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,
#                                     epsilon=0.00001, centered=True)

train_step = tf.Variable(0)
update_period = 4  # run a training step every 4 collect steps
optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4,
                                                decay=0.95,
                                                momentum=0.0,
                                                epsilon=0.00001,
                                                centered=True)
epsilon_fn = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
    end_learning_rate=0.01)  # final ε
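# For reference, PolynomialDecay with the default power=1.0 decays linearly:
# epsilon_fn(0) == 1.0, epsilon_fn(31250) ~= 0.505, and epsilon_fn(62500) == 0.01
# (the schedule stays at 0.01 for any later step).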

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    target_update_period=2000,  # <=> 32,000 ALE frames
    td_errors_loss_fn=tf.keras.losses.Huber(reduction="none"),
    gamma=0.99,  # discount factor
    train_step_counter=train_step,
    epsilon_greedy=lambda: epsilon_fn(train_step))

agent.initialize()

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=1000000)
Exemple #24
0
    # parallel_env is presumably built from four copies of create_env, e.g.:
    parallel_env = ParallelPyEnvironment(
        [create_env] * 4
    )
    train_env = TFPyEnvironment(parallel_env)
    # train_env = TFPyEnvironment(suite_gym.load(env_name))
    eval_env = TFPyEnvironment(suite_gym.load(env_name))

    fc_layer_params = (100,)
    q_net = QNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        fc_layer_params=fc_layer_params
    )
    train_step_counter = tf.Variable(0)

    agent = DqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=Adam(learning_rate=LEARNING_RATE),
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter
    )
    agent.initialize()

    random_policy = RandomTFPolicy(
        train_env.time_step_spec(),
        train_env.action_spec()
    )

    def compute_avg_return(environment, policy, num_episodes=10):
        total_return = 0
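        # Continuation sketch: the original body is truncated in this listing, so the
        # remainder below follows the standard TF-Agents average-return evaluation loop.
        for _ in range(num_episodes):
            time_step = environment.reset()
            episode_return = 0.0
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = environment.step(action_step.action)
                episode_return += time_step.reward
            total_return += episode_return
        avg_return = total_return / num_episodes
        return avg_return.numpy()[0]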
Exemple #25
0
prev_lives = tf_env.pyenv.envs[0].ale.lives()
def reset_and_fire_on_life_lost(trajectory):
    global prev_lives
    lives = tf_env.pyenv.envs[0].ale.lives()
    if prev_lives != lives:
        #tf_env.reset()
        tf_env.pyenv.envs[0].step(np.array(1, dtype=np.int32))
        prev_lives = lives

policy_num = sys.argv[1]

#print(type(agent))
saved_policy = tf.compat.v2.saved_model.load(f'policy_{policy_num}')
saved_policy.time_step_spec = tf_env.time_step_spec()
saved_policy.action_spec = tf_env.action_spec()
saved_policy.policy_state_spec = () # tf_env.policy_state_spec
saved_policy.info_spec = ()
saved_policy.emit_log_probability = True

saved_policy = EpsilonGreedyPolicy(saved_policy, epsilon=0.005)

#saved_policy = tf_agents.policies.gaussian_policy.GaussianPolicy(saved_policy)

#agent = tf.saved_model.load('policy_100')
#agent = tf.keras.models.load_model('policy_100')
#policy = tf.saved_model.load('')
#print(type(agent))
tf_env.pyenv.envs[0].step(np.array(1, dtype=np.int32))
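
# Sketch of how the pieces above are presumably wired together: a DynamicStepDriver (assumed
# here, not shown in this snippet) drives the epsilon-greedy saved policy while
# reset_and_fire_on_life_lost runs as an observer on every collected trajectory.
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

watch_driver = DynamicStepDriver(
    tf_env,
    saved_policy,
    observers=[reset_and_fire_on_life_lost],
    num_steps=1000)
final_time_step, final_policy_state = watch_driver.run()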
Exemple #26
0
def test_all_mepo_variants_work(transition_model, trajectory_sampler,
                                model_free_agent_type):
    """
    Mepo Agent has prespecified transition model, trajectory sampler and model-free agent
    types. Here we check that all combinations execute without errors.
    """

    # setup the environment and a prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(
        observation_spec)

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    # define agent, many transition model and trajectory optimiser parameters can
    # be arbitrary
    agent = MepoAgent(
        time_step_spec,
        action_spec,
        transition_model,
        1,
        10,
        tf.nn.relu,
        ensemble_size,
        False,
        1,
        1,
        [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler,
        horizon,
        population_size,
        model_free_agent_type,
        1,
        10,
        tf.nn.relu,
        2,
    )

    # we need some training data
    random_policy = RandomTFPolicy(
        time_step_spec,
        action_spec,
        info_spec=agent.collect_policy.info_spec,
    )
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        random_policy.trajectory_spec, batch_size=1, max_length=1000)
    collect_driver_random_policy = TFDriver(
        tf_env,
        random_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    initial_time_step = tf_env.reset()
    collect_driver_random_policy.run(initial_time_step)
    pets_agent_trainer = BackgroundPlanningAgentTrainer(10, 10)
    tf_training_scheduler = pets_agent_trainer.create_training_scheduler(
        agent, model_training_buffer)
    training_losses = tf_training_scheduler.maybe_train(
        tf.constant(10, dtype=tf.int64))
    assert EnvironmentModelComponents.TRANSITION in training_losses

    # test the agent
    collect_driver_planning_policy = TFDriver(
        tf_env,
        agent.collect_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
Exemple #27
0
def breakout_v4(seed=42):
    env = suite_gym.load("Breakout-v4")
    env.seed(seed)
    env.reset()

    repeating_env = ActionRepeat(env, times=4)
    for name in dir(tf_agents.environments.wrappers):
        obj = getattr(tf_agents.environments.wrappers, name)
        if hasattr(obj, "__base__") and issubclass(
                obj, tf_agents.environments.wrappers.PyEnvironmentBaseWrapper):
            print("{:27s} {}".format(name, obj.__doc__.split("\n")[0]))

    limited_repeating_env = suite_gym.load(
        "Breakout-v4",
        gym_env_wrappers=[partial(TimeLimit, max_episode_steps=10000)],
        env_wrappers=[partial(ActionRepeat, times=4)],
    )

    max_episode_steps = 27000  # <=> 108k ALE frames since 1 step = 4 frames
    environment_name = "BreakoutNoFrameskip-v4"

    env = suite_atari.load(
        environment_name,
        max_episode_steps=max_episode_steps,
        gym_env_wrappers=[AtariPreprocessing, FrameStack4],
    )

    env.seed(42)
    env.reset()
    time_step = env.step(np.array(1))  # FIRE
    for _ in range(4):
        time_step = env.step(np.array(3))  # LEFT

    def plot_observation(obs):
        # Since there are only 3 color channels, you cannot display 4 frames
        # with one primary color per frame. So this code computes the delta between
        # the current frame and the mean of the other frames, and it adds this delta
        # to the red and blue channels to get a pink color for the current frame.
        obs = obs.astype(np.float32)
        img_ = obs[..., :3]
        current_frame_delta = np.maximum(
            obs[..., 3] - obs[..., :3].mean(axis=-1), 0.0)
        img_[..., 0] += current_frame_delta
        img_[..., 2] += current_frame_delta
        img_ = np.clip(img_ / 150, 0, 1)
        plt.imshow(img_)
        plt.axis("off")

    plt.figure(figsize=(6, 6))
    plot_observation(time_step.observation)
    plt.tight_layout()
    plt.savefig("./images/preprocessed_breakout_plot.png",
                format="png",
                dpi=300)
    plt.show()

    tf_env = TFPyEnvironment(env)

    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.0)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        preprocessing_layers=preprocessing_layer,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
    )

    # see TF-agents issue #113
    # optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,
    #                                     epsilon=0.00001, centered=True)

    train_step = tf.Variable(0)
    update_period = 4  # run a training step every 4 collect steps
    optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4,
                                                    decay=0.95,
                                                    momentum=0.0,
                                                    epsilon=0.00001,
                                                    centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
        end_learning_rate=0.01,
    )  # final ε
    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=2000,  # <=> 32,000 ALE frames
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=train_step,
        epsilon_greedy=lambda: epsilon_fn(train_step),
    )
    agent.initialize()

    from tf_agents.replay_buffers import tf_uniform_replay_buffer

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000000)

    replay_buffer_observer = replay_buffer.add_batch

    class ShowProgress:
        def __init__(self, total):
            self.counter = 0
            self.total = total

        def __call__(self, trajectory):
            if not trajectory.is_boundary():
                self.counter += 1
            if self.counter % 100 == 0:
                print("\r{}/{}".format(self.counter, self.total), end="")

    from tf_agents.metrics import tf_metrics

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    from tf_agents.eval.metric_utils import log_metrics
    import logging

    logging.getLogger().setLevel(logging.INFO)
    log_metrics(train_metrics)

    from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

    collect_driver = DynamicStepDriver(
        tf_env,
        agent.collect_policy,
        observers=[replay_buffer_observer] + train_metrics,
        num_steps=update_period,
    )  # collect 4 steps for each training iteration

    from tf_agents.policies.random_tf_policy import RandomTFPolicy

    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer.add_batch,
                   ShowProgress(20000)],
        num_steps=20000,
    )  # <=> 80,000 ALE frames
    final_time_step, final_policy_state = init_driver.run()
Exemple #28
0
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="Pendulum-v0",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)

    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    critic_network = CriticNetwork(
        input_tensor_spec=(environment.observation_spec(),
                           environment.action_spec()),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=(200, 100),
    )
    actor_network = ActorNetwork(
        input_tensor_spec=environment.observation_spec(),
        output_tensor_spec=environment.action_spec(),
        fc_layer_params=(200, 100),
    )
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DdpgAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        critic_network=critic_network,
        actor_network=actor_network,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(),
        train_step_counter=global_step,
    )

    agent_trainer = OffPolicyModelFreeAgentTrainer(1, 256)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
    # Create a global step
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Create the actor network (with the normal distribution)
    actor_net = ActorDistributionNetwork(
        input_tensor_spec=train_env.observation_spec(),
        output_tensor_spec=train_env.action_spec(),
        fc_layer_params=(128, 256, 512, 512, 256),
        continuous_projection_net=normal_net)

    # Create the value network
    value_net = ValueNetwork(input_tensor_spec=train_env.observation_spec(),
                             fc_layer_params=(256, 512, 512))

    # Create the PPO agent
    ppo_agent = PPOClipAgent(time_step_spec=train_env.time_step_spec(),
                             action_spec=train_env.action_spec(),
                             optimizer=Adam(learning_rate=5e-4),
                             actor_net=actor_net,
                             value_net=value_net,
                             importance_ratio_clipping=0.2,
                             discount_factor=0.95,
                             entropy_regularization=0.0,
                             num_epochs=16,
                             use_gae=True,
                             use_td_lambda_return=True,
                             log_prob_clipping=3,
                             gradient_clipping=0.5,
                             train_step_counter=global_step)
    # Initialize the agent
    ppo_agent.initialize()