Example 1
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="CartPole-v1",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)

    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    actor_net = ActorDistributionNetwork(environment.observation_spec(),
                                         environment.action_spec(),
                                         fc_layer_params=(200, 100))
    value_net = ValueNetwork(environment.observation_spec(),
                             fc_layer_params=(200, 100))
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = PPOClipAgent(  # should be closer to the paper than PPOAgent...
        environment.time_step_spec(),
        environment.action_spec(),
        optimizer=tf.compat.v1.train.AdamOptimizer(),  # the default (None) does not work
        actor_net=actor_net,
        value_net=value_net,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        lambda_value=0.5,
        discount_factor=0.95,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(400)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
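A minimal invocation sketch for the harness above. It assumes the Bellman/TF-Agents imports used by `train_eval` (e.g. `ExperimentHarness`, `PPOClipAgent`) are already in scope; the root directory and step budget are illustrative.

# Hypothetical driver: run the PPO experiment and write TensorBoard summaries
# under an illustrative root directory.
if __name__ == "__main__":
    train_eval(root_dir="/tmp/ppo_cartpole", num_environment_steps=10000)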
Example 2
def test_incorrect_termination_model():
    """
    The generic model-based agent should only allow a ConstantFalseTermination model.
    """

    # setup arguments for the model-based agent constructor
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    network = LinearTransitionNetwork(observation_spec)
    transition_model = KerasTransitionModel([network], observation_spec, action_spec)
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)
    termination_model = MountainCarTermination(observation_spec)
    policy = RandomTFPolicy(time_step_spec, action_spec)

    with pytest.raises(AssertionError) as excinfo:
        ModelBasedAgent(
            time_step_spec,
            action_spec,
            transition_model,
            reward_model,
            termination_model,
            initial_state_distribution_model,
            policy,
            policy,
        )

    assert "Only constant false termination supported" in str(excinfo.value)
Example 3
def test_planning_policy_batch_environment_model():
    """
    Ensure that planning policy is operational.
    """

    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                   tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
Example 4
def generic_dqn_agent(env: TFPyEnvironment) -> (dqn_agent.DqnAgent, q_network.QNetwork):
    """ Function that returns a generic dqn agent
    args:
        env (TFPyEnvironment) : The environment the agent will live in

    Returns:
        dqn_agent.DqnAgent: The agent to train
        q_network.QNetwork: The network used in the agent
    """

    inp = env.observation_spec().shape[0]  # observation dimensionality (currently unused)
    q_net = q_network.QNetwork(
      env.observation_spec(),
      env.action_spec(),
      fc_layer_params=(20,20,20,20,20),
      activation_fn=tf.keras.activations.relu)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

    agent = dqn_agent.DqnAgent(
      env.time_step_spec(),
      env.action_spec(),
      q_network=q_net,
      optimizer=optimizer,
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=tf.Variable(0),
      epsilon_greedy=0.1
    )

    """def observation_and_action_constraint_splitter(observation):
        action_mask = [1,1]
        if observation[0][-1] > 5:
            action_mask[0] = 1
        return observation, tf.convert_to_tensor(action_mask, dtype=np.int32)

    agent.policy._observation_and_action_constraint_splitter = (
        observation_and_action_constraint_splitter
    )"""
    #tf_agents.policies.greedy_policy.GreedyPolicy

    agent.initialize()

    return agent, q_net
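A short usage sketch for the factory above, assuming the standard TF-Agents imports; `CartPole-v1` is only an illustrative environment.

from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment

# Wrap a Gym environment and build the agent/network pair.
env = TFPyEnvironment(suite_gym.load("CartPole-v1"))
agent, q_net = generic_dqn_agent(env)
print(agent.collect_data_spec)  # spec of the trajectories the agent expects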
Example 5
def test_unknown_transition_model():
    """
    The PETS agent has a prespecified set of transition models; a RuntimeError should be raised for an unknown model.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(
        observation_spec)

    # trajectory optimiser
    trajectory_optimiser_type = TrajectoryOptimizationType.CrossEntropyMethod
    transition_model_type = "unknown_model"
    trajectory_sampler_type = TrajectorySamplerType.TS1

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    learning_rate = 0.9
    max_iterations = 5
    population_size = num_elites + 10
    number_of_particles = 1
    horizon = 1

    with pytest.raises(RuntimeError) as excinfo:
        PetsAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            trajectory_optimiser_type,
            horizon,
            population_size,
            number_of_particles,
            num_elites,
            learning_rate,
            max_iterations,
        )

    assert "Unknown transition model" in str(excinfo.value)
Example 6
def test_ensemble_size_set_correctly():
    """
    For ensemble transition models, the ensemble size needs to be larger than 1.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # transition model and model-free agent
    transition_model_type = TransitionModelType.DeterministicEnsemble
    trajectory_sampler_type = TrajectorySamplerType.TS1
    model_free_agent_type = ModelFreeAgentType.Ppo

    # some parameters need to be set correctly
    ensemble_size = 1
    population_size = 10
    horizon = 1

    # define agent, many transition model and trajectory optimiser parameters can
    # be arbitrary
    with pytest.raises(AssertionError) as excinfo:
        MbpoAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            horizon,
            population_size,
            model_free_agent_type,
            1,
            10,
            tf.nn.relu,
            2,
            1,
        )

    assert "ensemble_size should be > 1" in str(excinfo.value)
Example 7
def test_unknown_transition_model():
    """
    The MBPO agent has a prespecified set of transition models; a RuntimeError should be raised for an unknown model.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # transition model and model-free agent
    transition_model_type = "unknown_model"
    trajectory_sampler_type = TrajectorySamplerType.TS1
    model_free_agent_type = ModelFreeAgentType.Ppo

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    with pytest.raises(RuntimeError) as excinfo:
        MbpoAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            horizon,
            population_size,
            model_free_agent_type,
            1,
            10,
            tf.nn.relu,
            2,
            1,
        )

    assert "Unknown transition model" in str(excinfo.value)
Example 8
def initialize_tf_agent(model_class: ABCMeta,
                        train_env: TFPyEnvironment) -> TFAgent:
    optimizer = Adam(learning_rate=1e-3)

    if model_class in [agents.PPOAgent]:
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            train_env.observation_spec(),
            train_env.action_spec(),
            fc_layer_params=(200, 100),
            activation_fn=tf.keras.activations.tanh,
        )
        value_net = value_network.ValueNetwork(
            train_env.observation_spec(),
            fc_layer_params=(200, 100),
            activation_fn=tf.keras.activations.tanh,
        )
        model = model_class(
            time_step_spec=train_env.time_step_spec(),
            action_spec=train_env.action_spec(),
            actor_net=actor_net,
            value_net=value_net,
            optimizer=optimizer,
        )
    elif model_class in [agents.DqnAgent]:
        action_spec = train_env.action_spec()
        num_actions = action_spec.maximum - action_spec.minimum + 1
        q_network = create_feedforward_network(fc_layer_units=(100, ),
                                               num_actions=num_actions)
        model = model_class(
            time_step_spec=train_env.time_step_spec(),
            action_spec=train_env.action_spec(),
            q_network=q_network,
            optimizer=optimizer,
        )
    elif model_class in [agents.ReinforceAgent]:
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            train_env.time_step_spec().observation,
            train_env.action_spec(),
            fc_layer_params=(100, ))
        model = model_class(
            time_step_spec=train_env.time_step_spec(),
            action_spec=train_env.action_spec(),
            actor_network=actor_net,
            optimizer=optimizer,
        )
    elif model_class in [agents.SacAgent]:
        time_step_spec = train_env.time_step_spec()
        observation_spec = time_step_spec.observation
        action_spec = train_env.action_spec()
        critic_joint_fc_layers = (256, 256)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=(256, 256),
            continuous_projection_net=TanhNormalProjectionNetwork,
        )
        critic_net = critic_network.CriticNetwork(
            (observation_spec, action_spec),
            joint_fc_layer_params=critic_joint_fc_layers,
            kernel_initializer="glorot_uniform",
            last_kernel_initializer="glorot_uniform",
        )
        model = agents.SacAgent(
            time_step_spec,
            action_spec,
            actor_network=actor_net,
            critic_network=critic_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(3e-4),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(3e-4),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(3e-4),
        )
    else:
        raise ValueError(
            f"Agent class `{model_class.__name__}` is not supported")
    model.initialize()
    return model
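A brief usage sketch for the factory above. It reuses the module's own `agents` alias (not shown in the excerpt) so the `model_class in [...]` checks match; `CartPole-v1` is just an illustrative environment.

from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment

# The PPO branch; the DQN/REINFORCE/SAC branches are selected the same way
# by passing the corresponding class from the `agents` module.
train_env = TFPyEnvironment(suite_gym.load("CartPole-v1"))
ppo_model = initialize_tf_agent(agents.PPOAgent, train_env)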
Example 9
class DQNAgent:
    def __init__(self) -> None:
        """
        A class for training a TF-agent
        based on https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
        """

        self.train_env = None  # Training environment
        self.agent = None  # The algorithm used to solve an RL problem is represented by a TF-Agent
        self.replay_buffer = None  # The replay buffer keeps track of data collected from the environment
        self.dataset = None  # The agent needs access to the replay buffer via an iterable tf.data.Dataset
        self.iterator = None  # The iterator of self.dataset

    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float, imb_ratio: float,
                replay_buffer_max_length: int, layers: dict) -> None:
        """
        Create the Q-network, agent and policy

        Args:
            X_train: A np.ndarray for training samples.
            y_train: A np.ndarray for the class labels of the training samples.
            lr: Learning rate for the optimizer (Adam by default).
            epsilon: Used for the default epsilon-greedy policy for choosing a random action.
            gamma: The discount factor for learning Q-values.
            imb_ratio: Imbalance ratio of the data, used to specify the reward in the environment.
            replay_buffer_max_length: Maximum length of the replay memory.
            layers: A dict containing the layers of the Q-Network (e.g. conv, dense, rnn, dropout).
        """

        dense_layers = layers.get("dense")
        conv_layers = layers.get("conv")
        dropout_layers = layers.get("dropout")

        self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

        q_net = QNetwork(self.train_env.observation_spec(), self.train_env.action_spec(), conv_layer_params=conv_layers,
                         fc_layer_params=dense_layers, dropout_layer_params=dropout_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)

        train_step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=train_step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )

        self.agent.initialize()

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=replay_buffer_max_length)

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, epochs: int, batch_size: int, eval_step: int, log_step: int,
            collect_steps_per_episode: int) -> None:
        """
        Starts the training of the Agent.

        Args:
            X_train: A np.ndarray for training samples.
            y_train: A np.ndarray for the class labels of the training samples.
            epochs: Number of epochs to train Agent
            batch_size: The Batch Size
            eval_step: Evaluate Model each 'eval_step'
            log_step: Monitor results of model each 'log_step'
            collect_steps_per_episode: Collect a few steps using collect_policy and save to the replay buffer.
        """

        self.dataset = self.replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2).prefetch(3)

        self.iterator = iter(self.dataset)

        def collect_step(environment, policy, buffer):
            time_step = environment.current_time_step()
            action_step = policy.action(time_step)
            next_time_step = environment.step(action_step.action)
            traj = trajectory.from_transition(time_step, action_step, next_time_step)

            # Add trajectory to the replay buffer
            buffer.add_batch(traj)

        def collect_data(env, policy, buffer, steps):
            for _ in range(steps):
                collect_step(env, policy, buffer)

        # (Optional) Optimize by wrapping some of the code in a graph using TF function.
        self.agent.train = common.function(self.agent.train)

        # Reset the train step
        self.agent.train_step_counter.assign(0)

        for _ in range(epochs):
            #print("epoch: ", _)
            # Collect a few steps using collect_policy and save to the replay buffer.
            collect_data(self.train_env, self.agent.collect_policy, self.replay_buffer, collect_steps_per_episode)

            # Sample a batch of data from the buffer and update the agent's network.
            experience, _ = next(self.iterator)
            train_loss = self.agent.train(experience).loss

            step = self.agent.train_step_counter.numpy()

            if step % log_step == 0:
                print('step = {0}: loss = {1}'.format(step, train_loss))

            if step % eval_step == 0:
                metrics = self.compute_metrics(X_train, y_train)
                print(metrics)

    def compute_metrics(self, X: np.ndarray, y_true: list) -> dict:
        """Compute Metrics for Evaluation"""
        # TODO: apply softmax layer for q logits?

        q, _ = self.agent._target_q_network(X, training=False)

        # y_scores = np.max(q.numpy(), axis=1)  # predicted scores (Q-Values)
        y_pred = np.argmax(q.numpy(), axis=1)  # predicted class label

        metrics = custom_metrics(y_true, y_pred)

        return metrics

    def evaluate(self, X: np.ndarray, y: list, X_train=None, y_train=None) -> dict:
        """
         Evaluation of trained Q-network
        """
        metrics = self.compute_metrics(X, y)

        print("evaluation: ", metrics)
        return metrics
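A minimal end-to-end sketch for the wrapper above, using a small synthetic dataset; `ClassifyEnv` and `custom_metrics` are project-specific helpers assumed to be importable alongside the class, and all hyperparameters are illustrative.

import numpy as np

# Tiny imbalanced toy dataset: 200 samples, 5 features, binary labels.
X_train = np.random.rand(200, 5).astype(np.float32)
y_train = (np.random.rand(200) < 0.1).astype(np.int32)

dqn = DQNAgent()
dqn.compile(X_train, y_train, lr=1e-3, epsilon=0.1, gamma=0.99, imb_ratio=0.1,
            replay_buffer_max_length=10000,
            layers={"dense": (64, 64), "conv": None, "dropout": None})
dqn.fit(X_train, y_train, epochs=50, batch_size=32, eval_step=25, log_step=10,
        collect_steps_per_episode=100)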
Example 10
def create_reinforce_agent(
        env: TFPyEnvironment,
        gamma: float = 0.99,
        agent_name: str = 'reinforce_agent',
        debug: bool = False,
        training_step_counter: Optional[Any] = None,
        agent_params: Optional[Dict[str, Any]] = None) -> ReinforceAgent:
    """
    Function for creating a REINFORCE agent in line with the TensorFlow Agents implementation.
    This function builds an action network and uses this to instantiate the agent which is returned.

    :param env: TensorFlow Environment implementing the ControlledRandomWalk.
    :param gamma: Discount factor.
    :param agent_name: Name for the agent to aid in identifying TensorFlow variables etc. when
        debugging.
    :param debug: Flag which toggles debugging in the REINFORCE agent.
    :param training_step_counter: An optional counter to increment every time the train op of the
        agent is run. If None is provided, it defaults to the global_step.
    :param agent_params: A dictionary of possible overrides for the default TF-Agents agent set up.
    :return: An instance of TensorFlow Agents REINFORCE agent.
    """
    # Process the action specification to attain the dimensions of the action subspaces to ensure
    # that in the case that there is only one resource set (and therefore only one action subspace)
    # the tuple of action specifications of length one is replaced by a single action specification.
    # This is to align with the fact that the actor network is implemented to return a tuple of
    # (OneHotCategorical) distributions (one for each resource set) where there are multiple action
    # subspaces and a single distribution (tfp.distributions.OneHotCategorical) otherwise.
    # First attain the action spec.
    action_spec = env.action_spec()

    # Extract the shape of the subspaces from the action specification tuple.
    # Action spaces are defined with shape (1, num_actions_for_resource_set) so take the -1th entry.
    action_subspace_dimensions = tuple(
        int(subspace.shape[-1]) for subspace in action_spec)

    # Then test if there is only one action subspace.
    if len(action_spec) == 1:
        # Pull out the only action spec.
        action_spec = action_spec[0]

    if agent_params is None:
        agent_params = dict()

    # Set up the action network. See `multi_headed_softmax_policy.py` for details.
    actor_network = MultiHeadedCategoricalActionNetwork(
        input_tensor_spec=env.observation_spec(),
        output_tensor_spec=action_spec,
        action_subspace_dimensions=action_subspace_dimensions,
        hidden_units=agent_params.get('hidden_units', (64, )))
    # Set up the REINFORCE agent in line with standard tf_agents.
    agent = ReinforceAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=action_spec,
        actor_network=actor_network,
        optimizer=tf.compat.v1.train.AdamOptimizer(),
        value_network=agent_params.get('value_network', None),
        value_estimation_loss_coef=agent_params.get(
            'value_estimation_loss_coef', 0.2),
        advantage_fn=agent_params.get('advantage_fn', None),
        use_advantage_loss=agent_params.get('use_advantage_loss', True),
        gamma=gamma,
        normalize_returns=agent_params.get('normalize_returns', True),
        gradient_clipping=agent_params.get('gradient_clipping', None),
        debug_summaries=debug,
        summarize_grads_and_vars=debug,
        entropy_regularization=agent_params.get('entropy_regularization', None),
        train_step_counter=training_step_counter,
        name=agent_name)

    return agent
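A hedged usage sketch for the factory above; `crw_env` is assumed to be a `TFPyEnvironment` wrapping the project's ControlledRandomWalk (not shown in the excerpt), and the hidden-unit override is illustrative.

# Hypothetical usage: build and initialise a REINFORCE agent for a
# pre-constructed ControlledRandomWalk environment `crw_env`.
reinforce_agent = create_reinforce_agent(
    crw_env,
    gamma=0.95,
    agent_params={'hidden_units': (128, 64)})
reinforce_agent.initialize()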
Example 11
def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the Deep Q-Network
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)

    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)

    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε

    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] +
                                       train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer,
                   ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )

    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=eval_tf_env.batch_size, buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")
Example 12
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="Pendulum-v0",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)

    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    critic_network = CriticNetwork(
        input_tensor_spec=(environment.observation_spec(),
                           environment.action_spec()),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=(200, 100),
    )
    actor_network = ActorNetwork(
        input_tensor_spec=environment.observation_spec(),
        output_tensor_spec=environment.action_spec(),
        fc_layer_params=(200, 100),
    )
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DdpgAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        critic_network=critic_network,
        actor_network=actor_network,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(),
        train_step_counter=global_step,
    )

    agent_trainer = OffPolicyModelFreeAgentTrainer(1, 256)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
Example 13
class TrainDDQN():
    """Wrapper for DDQN training, validation, saving etc."""
    def __init__(self,
                 episodes: int,
                 warmup_steps: int,
                 learning_rate: float,
                 gamma: float,
                 min_epsilon: float,
                 decay_episodes: int,
                 model_path: str = None,
                 log_dir: str = None,
                 batch_size: int = 64,
                 memory_length: int = None,
                 collect_steps_per_episode: int = 1,
                 val_every: int = None,
                 target_update_period: int = 1,
                 target_update_tau: float = 1.0,
                 progressbar: bool = True,
                 n_step_update: int = 1,
                 gradient_clipping: float = 1.0,
                 collect_every: int = 1) -> None:
        """
        Wrapper to make training easier.
        Code is partly based on https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial

        :param episodes: Number of training episodes
        :type  episodes: int
        :param warmup_steps: Number of steps to fill the Replay Buffer with random state-action pairs before training starts
        :type  warmup_steps: int
        :param learning_rate: Learning Rate for the Adam Optimizer
        :type  learning_rate: float
        :param gamma: Discount factor for the Q-values
        :type  gamma: float
        :param min_epsilon: Lowest and final value for epsilon
        :type  min_epsilon: float
        :param decay_episodes: Amount of episodes to decay from 1 to `min_epsilon`
        :type  decay_episodes: int
        :param model_path: Location to save the trained model
        :type  model_path: str
        :param log_dir: Location to save the logs, useful for TensorBoard
        :type  log_dir: str
        :param batch_size: Number of samples in minibatch to train on each step
        :type  batch_size: int
        :param memory_length: Maximum size of the Replay Buffer
        :type  memory_length: int
        :param collect_steps_per_episode: Amount of data to collect for the Replay Buffer each episode
        :type  collect_steps_per_episode: int
        :param collect_every: Step interval to collect data during training
        :type  collect_every: int
        :param val_every: Validate the model every X episodes using the `collect_metrics()` function
        :type  val_every: int
        :param target_update_period: Update the target Q-network every X episodes
        :type  target_update_period: int
        :param target_update_tau: Parameter for softening the `target_update_period`
        :type  target_update_tau: float
        :param progressbar: Enable or disable the progressbar for collecting data and training
        :type  progressbar: bool

        :return: None
        :rtype: NoneType
        """
        self.episodes = episodes  # Total episodes
        self.warmup_steps = warmup_steps  # Amount of warmup steps before training
        self.batch_size = batch_size  # Batch size of Replay Memory
        self.collect_steps_per_episode = collect_steps_per_episode  # Amount of steps to collect data each episode
        self.collect_every = collect_every  # Step interval to collect data during training
        self.learning_rate = learning_rate  # Learning Rate
        self.gamma = gamma  # Discount factor
        self.min_epsilon = min_epsilon  # Minimal chance of choosing random action
        self.decay_episodes = decay_episodes  # Number of episodes to decay from 1.0 to `EPSILON`
        self.target_update_period = target_update_period  # Period for soft updates
        self.target_update_tau = target_update_tau
        self.progressbar = progressbar  # Enable or disable the progressbar for collecting data and training
        self.n_step_update = n_step_update
        self.gradient_clipping = gradient_clipping  # Gradient clipping value
        self.compiled = False
        NOW = datetime.now().strftime("%Y%m%d_%H%M%S")

        if memory_length is not None:
            self.memory_length = memory_length  # Max Replay Memory length
        else:
            self.memory_length = warmup_steps

        if val_every is not None:
            self.val_every = val_every  # Validate the policy every `val_every` episodes
        else:
            self.val_every = self.episodes // min(
                50, self.episodes
            )  # Can't validate the model 50 times if self.episodes < 50

        if model_path is not None:
            self.model_path = model_path
        else:
            self.model_path = "./models/" + NOW + ".pkl"

        if log_dir is None:
            log_dir = "./logs/" + NOW
        self.writer = tf.summary.create_file_writer(log_dir)

    def compile_model(self,
                      X_train,
                      y_train,
                      layers: list = [],
                      imb_ratio: float = None,
                      loss_fn=common.element_wise_squared_loss) -> None:
        """Initializes the neural networks, DDQN-agent, collect policies and replay buffer.

        :param X_train: Training data for the model.
        :type  X_train: np.ndarray
        :param y_train: Labels corresponding to `X_train`.  1 for the positive class, 0 for the negative class.
        :type  y_train: np.ndarray
        :param layers: List of layers to feed into the TF-agents custom Sequential(!) layer.
        :type  layers: list
        :param imb_ratio: The imbalance ratio of the data.
        :type  imb_ratio: float
        :param loss_fn: Callable loss function
        :type  loss_fn: tf.compat.v1.losses

        :return: None
        :rtype: NoneType
        """
        if imb_ratio is None:
            imb_ratio = imbalance_ratio(y_train)

        self.train_env = TFPyEnvironment(
            ClassifierEnv(X_train, y_train, imb_ratio))
        self.global_episode = tf.Variable(
            0, name="global_episode", dtype=np.int64,
            trainable=False)  # Global train episode counter

        # Custom epsilon decay: https://github.com/tensorflow/agents/issues/339
        epsilon_decay = tf.compat.v1.train.polynomial_decay(
            1.0,
            self.global_episode,
            self.decay_episodes,
            end_learning_rate=self.min_epsilon)

        self.q_net = Sequential(layers, self.train_env.observation_spec())

        self.agent = DdqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=self.q_net,
            optimizer=Adam(learning_rate=self.learning_rate),
            td_errors_loss_fn=loss_fn,
            train_step_counter=self.global_episode,
            target_update_period=self.target_update_period,
            target_update_tau=self.target_update_tau,
            gamma=self.gamma,
            epsilon_greedy=epsilon_decay,
            n_step_update=self.n_step_update,
            gradient_clipping=self.gradient_clipping)
        self.agent.initialize()

        self.random_policy = RandomTFPolicy(self.train_env.time_step_spec(),
                                            self.train_env.action_spec())
        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=self.memory_length)

        self.warmup_driver = DynamicStepDriver(
            self.train_env,
            self.random_policy,
            observers=[self.replay_buffer.add_batch],
            num_steps=self.warmup_steps)  # Uses a random policy

        self.collect_driver = DynamicStepDriver(
            self.train_env,
            self.agent.collect_policy,
            observers=[self.replay_buffer.add_batch],
            num_steps=self.collect_steps_per_episode
        )  # Uses the epsilon-greedy policy of the agent

        self.agent.train = common.function(self.agent.train)  # Optimization
        self.warmup_driver.run = common.function(self.warmup_driver.run)
        self.collect_driver.run = common.function(self.collect_driver.run)

        self.compiled = True

    def train(self, *args) -> None:
        """Starts the training of the model. Includes warmup period, metrics collection and model saving.

        :param *args: All arguments will be passed to `collect_metrics()`.
            This can be useful to pass callables, testing environments or validation data.
            Overwrite the TrainDDQN.collect_metrics() function to use your own *args.
        :type  *args: Any

        :return: None
        :rtype: NoneType, last step is saving the model as a side-effect
        """
        assert self.compiled, "Model must be compiled with model.compile_model(X_train, y_train, layers) before training."

        # Warmup period, fill memory with random actions
        if self.progressbar:
            print(
                f"\033[92mCollecting data for {self.warmup_steps:_} steps... This might take a few minutes...\033[0m"
            )

        self.warmup_driver.run(
            time_step=None,
            policy_state=self.random_policy.get_initial_state(
                self.train_env.batch_size))

        if self.progressbar:
            print(
                f"\033[92m{self.replay_buffer.num_frames():_} frames collected!\033[0m"
            )

        dataset = self.replay_buffer.as_dataset(
            sample_batch_size=self.batch_size,
            num_steps=self.n_step_update + 1,
            num_parallel_calls=data.experimental.AUTOTUNE).prefetch(
                data.experimental.AUTOTUNE)
        iterator = iter(dataset)

        def _train():
            experiences, _ = next(iterator)
            return self.agent.train(experiences).loss

        _train = common.function(_train)  # Optimization

        ts = None
        policy_state = self.agent.collect_policy.get_initial_state(
            self.train_env.batch_size)
        self.collect_metrics(*args)  # Initial collection for step 0
        pbar = tqdm(total=self.episodes,
                    disable=(not self.progressbar),
                    desc="Training the DDQN")  # TQDM progressbar
        for _ in range(self.episodes):
            if not self.global_episode % self.collect_every:
                # Collect a few steps using collect_policy and save to `replay_buffer`
                if self.collect_steps_per_episode != 0:
                    ts, policy_state = self.collect_driver.run(
                        time_step=ts, policy_state=policy_state)
                pbar.update(
                    self.collect_every
                )  # More stable TQDM updates, collecting could take some time

            # Sample a batch of data from `replay_buffer` and update the agent's network
            train_loss = _train()

            if not self.global_episode % self.val_every:
                with self.writer.as_default():
                    tf.summary.scalar("train_loss",
                                      train_loss,
                                      step=self.global_episode)

                self.collect_metrics(*args)
        pbar.close()

    def collect_metrics(self,
                        X_val: np.ndarray,
                        y_val: np.ndarray,
                        save_best: str = None):
        """Collects metrics using the trained Q-network.

        :param X_val: Features of validation data, same shape as X_train
        :type  X_val: np.ndarray
        :param y_val: Labels of validation data, same shape as y_train
        :type  y_val: np.ndarray
        :param save_best: Saving the best model of all validation runs based on given metric:
            Choose one of: {Gmean, F1, Precision, Recall, TP, TN, FP, FN}
            This improves stability since the model at the last episode is not guaranteed to be the best model.
        :type  save_best: str
        """
        y_pred = network_predictions(self.agent._target_q_network, X_val)
        stats = classification_metrics(y_val, y_pred)
        avgQ = np.mean(decision_function(self.agent._target_q_network,
                                         X_val))  # Max action for each x in X

        if save_best is not None:
            if not hasattr(self, "best_score"):  # If no best model yet
                self.best_score = 0.0

            if stats.get(save_best) >= self.best_score:  # Overwrite best model
                self.save_network(
                )  # Saving directly to avoid shallow copy without trained weights
                self.best_score = stats.get(save_best)

        with self.writer.as_default():
            tf.summary.scalar(
                "AverageQ", avgQ,
                step=self.global_episode)  # Average Q-value for this epoch
            for k, v in stats.items():
                tf.summary.scalar(k, v, step=self.global_episode)

    def evaluate(self, X_test, y_test, X_train=None, y_train=None):
        """
        Final evaluation of trained Q-network with X_test and y_test.
        Optional PR and ROC curve comparison to X_train, y_train to ensure no overfitting is taking place.

        :param X_test: Features of test data, same shape as X_train
        :type  X_test: np.ndarray
        :param y_test: Labels of test data, same shape as y_train
        :type  y_test: np.ndarray
        :param X_train: Features of train data
        :type  X_train: np.ndarray
        :param y_train: Labels of train data
        :type  y_train: np.ndarray
        """
        if hasattr(self, "best_score"):
            print(f"\033[92mBest score: {self.best_score:6f}!\033[0m")
            network = self.load_network(
                self.model_path)  # Load best saved model
        else:
            network = self.agent._target_q_network  # Load latest target model

        if (X_train is not None) and (y_train is not None):
            plot_pr_curve(network, X_test, y_test, X_train, y_train)
            plot_roc_curve(network, X_test, y_test, X_train, y_train)

        y_pred = network_predictions(network, X_test)
        return classification_metrics(y_test, y_pred)

    def save_network(self):
        """Saves Q-network as pickle to `model_path`."""
        with open(self.model_path, "wb") as f:  # Save Q-network as pickle
            pickle.dump(self.agent._target_q_network, f)

    @staticmethod
    def load_network(fp: str):
        """Static method to load Q-network pickle from given filepath.

        :param fp: Filepath to the saved pickle of the network
        :type  fp: str

        :returns: The network-object loaded from a pickle file.
        :rtype: tensorflow.keras.models.Model
        """
        with open(fp, "rb") as f:  # Load the Q-network
            network = pickle.load(f)
        return network
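A hedged compile-and-train sketch for `TrainDDQN`, with synthetic data and plain Keras layers for the TF-Agents `Sequential` network; `ClassifierEnv`, `imbalance_ratio` and the metric helpers are project-specific imports assumed to be available, and every hyperparameter is illustrative.

import numpy as np
import tensorflow as tf

# Synthetic, heavily imbalanced toy data (~5% positives).
X_train = np.random.rand(500, 10).astype(np.float32)
y_train = (np.random.rand(500) < 0.05).astype(np.int32)
X_val, y_val = X_train[:100], y_train[:100]

model = TrainDDQN(episodes=2000, warmup_steps=1000, learning_rate=1e-3,
                  gamma=0.99, min_epsilon=0.05, decay_episodes=500,
                  batch_size=64, val_every=200)
model.compile_model(X_train, y_train,
                    layers=[tf.keras.layers.Dense(64, activation="relu"),
                            tf.keras.layers.Dense(2)])  # one Q-value per class
model.train(X_val, y_val)
print(model.evaluate(X_val, y_val))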
Example 14
def create_bellman_pets_agent(
        env: TFPyEnvironment,
        agent_name: str = 'PETS_Agent',
        debug: bool = False,  # REQUIRED?
        reward_model_class: RewardModel = None,
        initial_state_distribution_model_class:
    InitialStateDistributionModel = None,
        training_step_counter: Optional[Any] = None,
        agent_params: Optional[Dict[str, Any]] = None) -> PetsAgent:
    """
    Function for creating a Bellman PETS agent in line with the Bellman
    implementation.
    This function builds an action network and uses this to instantiate the agent which is returned.

    :param env: TensorFlow Environment implementing the ControlledRandomWalk.
    :param agent_name: Name for the agent to aid in identifying TensorFlow variables etc. when
        debugging.
    :param debug: Flag which toggles debugging in the PETS agent.
    :param reward_model_class: CRWRewardModel, dummy variable, currently extracted from env
    :param initial_state_distribution_model_class: CRWStateInitialiser, dummy variable, currently extracted from env
    :param training_step_counter: An optional counter to increment every time the train op of the
        agent is run. If None is provided, it defaults to the global_step.
    :param agent_params: A dictionary of possible overrides for the default TF-Agents agent set up.
    :return: An instance of Bellman PETS agent.
    """
    # Process the action specification to attain the dimensions of the action subspaces to ensure
    # that in the case that there is only one resource set (and therefore only one action subspace)
    # the tuple of action specifications of length one is replaced by a single action specification.
    # This is to align with the fact that the actor network is implemented to return a tuple of
    # (OneHotCategorical) distributions (one for each resource set) where there are multiple action
    # subspaces and a single distribution (tfp.distributions.OneHotCategorical) otherwise.
    # First attain the action spec.
    # action_spec = env.action_spec()

    # Extract the shape of the subspaces from the action specification tuple.
    # Action spaces are defined with shape (1, num_actions_for_resource_set) so take the -1th entry.
    # action_subspace_dimensions = tuple(int(subspace.shape[-1]) for subspace in action_spec)

    # # Then test if there is only one action subspace.
    # if len(action_spec) == 1:
    #     # Pull out the only action spec.
    #     action_spec = action_spec[0]

    if agent_params is None:
        agent_params = dict()

    callbacks = [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)]

    # initializing given MDP components
    # NOTE: hacked for the time being, using quantities directly from the environment for now
    reward_model = reward_model_class(env.observation_spec(),
                                      env.action_spec(), env)
    initial_state_distribution_model = initial_state_distribution_model_class(
        env)

    # Set up the PETS agent in line with Bellman toolbox
    agent = PetsAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=env.action_spec(),
        transition_model_type=agent_params.get(
            'transition_model_type',
            TransitionModelType.DeterministicEnsemble),
        num_hidden_layers=agent_params.get('num_hidden_layers', 3),
        num_hidden_nodes=agent_params.get('num_hidden_nodes', 250),
        activation_function=agent_params.get('activation_function',
                                             tf.nn.relu),
        ensemble_size=agent_params.get('ensemble_size', 5),
        predict_state_difference=agent_params.get('predict_state_difference',
                                                  True),
        epochs=agent_params.get('epochs', 100),
        training_batch_size=agent_params.get('training_batch_size', 32),
        callbacks=agent_params.get('callbacks', callbacks),
        reward_model=reward_model,
        initial_state_distribution_model=initial_state_distribution_model,
        trajectory_sampler_type=agent_params.get('trajectory_sampler_type',
                                                 TrajectorySamplerType.TS1),
        trajectory_optimization_type=agent_params.get(
            'trajectory_optimization_type',
            TrajectoryOptimizationType.RandomShooting),
        horizon=agent_params.get('horizon', 25),
        population_size=agent_params.get('population_size', 2500),
        number_of_particles=agent_params.get('number_of_particles', 1),
        num_elites=agent_params.get('num_elites', 40),
        learning_rate=agent_params.get('learning_rate', 0.9),
        max_iterations=agent_params.get('max_iterations', 5),
        train_step_counter=training_step_counter,
    )

    return agent
Example 15
max_episode_steps = 27000  # <=> 108k ALE frames, since 1 step = 4 frames
environment_name = "BreakoutNoFrameskip-v4"

env = suite_gym.load(
    environment_name,
    max_episode_steps=max_episode_steps,
    gym_env_wrappers=[AtariPreprocessing, FrameStack4])

tf_env = TFPyEnvironment(env)

preprocessing_layer = keras.layers.Lambda(lambda obs: tf.cast(obs, np.float32) / 255.)
conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
fc_layer_params = [512]

q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layer,
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params)

train_step = tf.Variable(0)
update_period = 4 # train the model every 4 steps
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4, rho=0.95, momentum=0.0, epsilon=0.00001, centered=True)
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(initial_learning_rate=1.0, decay_steps= 250000, end_learning_rate=0.01)

agent = DqnAgent(tf_env.time_step_spec(),
                    tf_env.action_spec(),
                    q_network=q_net,
                    optimizer=optimizer,
                    target_update_period=2000,
Example 16
gpu = tf.config.experimental.list_physical_devices('GPU')[0]
tf.config.experimental.set_memory_growth(gpu, True)

# 1. Creating the tf-environment for training.
carla_environment = CarlaEnvironment()
train_env = TFPyEnvironment(environment=carla_environment)

# 2. Constructing the Categorical QNetworks: Online & Target.
# Default Activation Function: "Gelu".
# Default Weight Initialization: "He (Xavier) Initialization".
fc_layer_units = [128, 128]
conv_layer_units = [ (4, 3, 1) ]
num_atoms = 51

online_q_net = CategoricalQNetwork(
    input_tensor_spec=train_env.observation_spec(),
    action_spec=train_env.action_spec(),
    num_atoms=num_atoms,
    conv_layer_params=conv_layer_units,
    fc_layer_params=fc_layer_units,
    activation_fn=GELU()
)
target_q_net = CategoricalQNetwork(
    input_tensor_spec=train_env.observation_spec(),
    action_spec=train_env.action_spec(),
    num_atoms=num_atoms,
    conv_layer_params=conv_layer_units,
    fc_layer_params=fc_layer_units,
    activation_fn=GELU()
)
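A follow-up sketch showing how these networks might be wired into a C51 agent; the call mirrors the TF-Agents `CategoricalDqnAgent` constructor, and the optimizer settings and `min_q_value`/`max_q_value` support bounds are illustrative assumptions.

from tf_agents.agents.categorical_dqn import categorical_dqn_agent
from tf_agents.utils import common

# 3. Assembling the C51 agent from the online/target networks above.
train_step = tf.Variable(0, dtype=tf.int64)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)  # assumed learning rate

agent = categorical_dqn_agent.CategoricalDqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    categorical_q_network=online_q_net,
    target_categorical_q_network=target_q_net,
    optimizer=optimizer,
    min_q_value=-20.0,  # assumed support bounds for the value distribution
    max_q_value=20.0,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gamma=0.99,
    train_step_counter=train_step)
agent.initialize()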
Example 17
def create_ppo_agent(
        env: TFPyEnvironment,
        num_epochs: int = 10,
        gamma: float = 0.99,
        agent_name: str = 'PPO_Agent',
        debug: bool = False,
        training_step_counter: Optional[Any] = None,
        agent_params: Optional[Dict[str, Any]] = None) -> PPOAgent:
    """
    Function for creating a Proximal Policy Optimisation agent in line with the TensorFlow Agents
    implementation.
    This function builds an action network and uses this to instantiate the agent which is returned.

    :param env: TensorFlow Environment implementing the ControlledRandomWalk.
    :param num_epochs: Number of epochs for computing policy updates.
    :param gamma: Discount factor.
    :param agent_name: Name for the agent to aid in identifying TensorFlow variables etc. when
        debugging.
    :param debug: Flag which toggles debugging in the PPO agent.
    :param training_step_counter: An optional counter to increment every time the train op of the
        agent is run. If None is provided, it defaults to the global_step.
    :param agent_params: A dictionary of possible overrides for the default TF-Agents agent set up.
    :return: An instance of TensorFlow Agents PPO agent.
    """
    # Process the action specification to attain the dimensions of the action subspaces to ensure
    # that in the case that there is only one resource set (and therefore only one action subspace)
    # the tuple of action specifications of length one is replaced by a single action specification.
    # This is to align with the fact that the actor network is implemented to return a tuple of
    # (OneHotCategorical) distributions (one for each resource set) where there are multiple action
    # subspaces and a single distribution (tfp.distributions.OneHotCategorical) otherwise.
    # First attain the action spec.
    action_spec = env.action_spec()

    # Extract the shape of the subspaces from the action specification tuple.
    # Action spaces are defined with shape (1, num_actions_for_resource_set) so take the -1th entry.
    action_subspace_dimensions = tuple(
        int(subspace.shape[-1]) for subspace in action_spec)

    # Then test if there is only one action subspace.
    if len(action_spec) == 1:
        # Pull out the only action spec.
        action_spec = action_spec[0]

    if agent_params is None:
        agent_params = dict()

    # Set up the action network. See `multi_headed_softmax_policy.py` for details.
    actor_network = MultiHeadedCategoricalActionNetwork(
        input_tensor_spec=env.observation_spec(),
        output_tensor_spec=action_spec,
        action_subspace_dimensions=action_subspace_dimensions,
        hidden_units=agent_params.get('hidden_units', (64, )))

    # PPO requires a value network; we set one up using the default tf_agents ValueNetwork.
    value_network = tf_agents.networks.value_network.ValueNetwork(
        env.observation_spec(),
        fc_layer_params=agent_params.get('value_fc_layer_params', (128, 64)),
        activation_fn=agent_params.get('value_net_activation_fn', tf.nn.tanh))

    # Set up the PPO agent in line with standard tf_agents.
    agent = PPOAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=action_spec,
        actor_net=actor_network,
        optimizer=tf.compat.v1.train.AdamOptimizer(
            agent_params.get('learning_rate', 0.001)),
        value_net=value_network,
        importance_ratio_clipping=agent_params.get('importance_ratio_clipping',
                                                   0.0),
        lambda_value=agent_params.get('lambda_value', 0.95),
        discount_factor=gamma,
        policy_l2_reg=agent_params.get('policy_l2_reg', 0.0),
        value_function_l2_reg=agent_params.get('value_function_l2_reg', 0.0),
        value_pred_loss_coef=agent_params.get('value_pred_loss_coef', 0.5),
        num_epochs=num_epochs,
        use_gae=agent_params.get('use_gae', False),
        use_td_lambda_return=agent_params.get('use_td_lambda_return', False),
        normalize_rewards=agent_params.get('normalise_rewards', True),
        reward_norm_clipping=agent_params.get('reward_norm_clipping', 10),
        kl_cutoff_factor=agent_params.get('kl_cutoff_factor', 2.0),
        kl_cutoff_coef=agent_params.get('kl_cutoff_coef', 1000),
        initial_adaptive_kl_beta=agent_params.get('initial_adaptive_kl_beta',
                                                  1.0),
        adaptive_kl_target=agent_params.get('adaptive_kl_target', 0.01),
        adaptive_kl_tolerance=agent_params.get('adaptive_kl_tolerance', 0.3),
        normalize_observations=agent_params.get('normalize_observations',
                                                True),
        gradient_clipping=agent_params.get('gradient_clipping', None),
        debug_summaries=debug,
        summarize_grads_and_vars=debug,
        check_numerics=agent_params.get('check_numerics', False),
        entropy_regularization=agent_params.get('entropy_regularization', 0.0),
        train_step_counter=training_step_counter,
        name=agent_name)

    return agent
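# A hedged usage sketch for the factory above. `make_crw_tf_environment` is a
# hypothetical helper standing in for however the ControlledRandomWalk
# environment is wrapped in a TFPyEnvironment; the override values passed via
# agent_params are illustrative only.
crw_env = make_crw_tf_environment()  # hypothetical environment constructor
ppo_agent = create_ppo_agent(
    crw_env,
    num_epochs=20,
    gamma=0.98,
    agent_params={'hidden_units': (128, 64), 'use_gae': True},
)
ppo_agent.initialize()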
Esempio n. 18
0
BATCH_SIZE = 256  # Batch size (for replay buffer)
BUFFER_LENGTH = 131072  # Maximum number of steps in the buffer
STEPS_PER_ITER = 4096  # Steps collected per iteration (driver)
N_ITERATIONS = 1000  # Number of training iterations per session
EVAL_MAX_STEPS = 1000  # Maximum number of env steps during evaluation
COLLECT_RANDOM = True  # Use random policy to collect data

if __name__ == '__main__':
    # Create global step counter
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Create a dummy environment with no policy, just to extract the specs
    dummy_env = TFPyEnvironment(NineMensMorris(None, discount=DISCOUNT))

    # Create Q Network
    q_net = QNetwork(input_tensor_spec=dummy_env.observation_spec(),
                     action_spec=dummy_env.action_spec(),
                     fc_layer_params=(100, 600, 600, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 600, 600),
                     dropout_layer_params=(None, 0.1, 0.1, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.1, None))

    # Create agent
    agent = DdqnAgent(time_step_spec=dummy_env.time_step_spec(),
                      action_spec=dummy_env.action_spec(),
                      q_network=q_net,
                      optimizer=Adam(learning_rate=1e-4),
                      td_errors_loss_fn=common.element_wise_squared_loss,
                      epsilon_greedy=0.1,
                      train_step_counter=global_step)
    # Initialize agent
    agent.initialize()
    # Wrap the training function in a TF graph
    agent.train = common.function(agent.train)
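    # NOTE: a hedged sketch of the collection/training plumbing that the
    # constants at the top of this example appear to be intended for. The
    # training environment instance and the use of a random policy for data
    # collection (gated on COLLECT_RANDOM) are assumptions; the original
    # example is truncated before this point.
    from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
    from tf_agents.policies.random_tf_policy import RandomTFPolicy
    from tf_agents.replay_buffers import tf_uniform_replay_buffer

    train_env = TFPyEnvironment(NineMensMorris(None, discount=DISCOUNT))
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=BUFFER_LENGTH)

    collect_policy = (RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())
                      if COLLECT_RANDOM else agent.collect_policy)
    collect_driver = DynamicStepDriver(
        train_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=STEPS_PER_ITER)

    dataset = replay_buffer.as_dataset(
        sample_batch_size=BATCH_SIZE, num_steps=2, num_parallel_calls=3).prefetch(3)
    iterator = iter(dataset)

    for _ in range(N_ITERATIONS):
        collect_driver.run()
        experience, _ = next(iterator)
        train_loss = agent.train(experience).loss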
Esempio n. 19
0
#     timestep = tf_env.step(np.array(np.random.choice([0,1,2,3]), dtype= np.int32))
#     if(timestep.is_last()):
#         print("game over", i)
#         break
#     tf_env.render(mode = "human")
#     time.sleep(0.2)

preprocessing_layers = keras.layers.Lambda(
    lambda obs: tf.cast(obs, np.float32) / 255.)

print("after preprocessing layer")
conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]

fc_layer_params = [512]

q_net = QNetwork(tf_env.observation_spec(),
                 tf_env.action_spec(),
                 preprocessing_layers=preprocessing_layers,
                 conv_layer_params=conv_layer_params,
                 fc_layer_params=fc_layer_params)

train_step = tf.Variable(0)
update_period = 4
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                     rho=0.95,
                                     momentum=0.0,
                                     epsilon=0.00001,
                                     centered=True)

print("Before Epsilon function")
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,  # initial ε
    decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
    end_learning_rate=0.01)  # final ε
Esempio n. 20
0
    # env = suite_gym.load(env_name)
    # print(env.type)

    def create_env():
        return suite_gym.load(env_name)

    parallel_env = ParallelPyEnvironment(
        [create_env] * 4
    )
    train_env = TFPyEnvironment(parallel_env)
    # train_env = TFPyEnvironment(suite_gym.load(env_name))
    eval_env = TFPyEnvironment(suite_gym.load(env_name))

    fc_layer_params = (100,)
    q_net = QNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        fc_layer_params=fc_layer_params
    )
    train_step_counter = tf.Variable(0)

    agent = DqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=Adam(learning_rate=LEARNING_RATE),
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter
    )
    agent.initialize()
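    # Hedged sketch of an evaluation helper for the eval_env defined above;
    # the episode count and the use of agent.policy (the greedy policy) are
    # assumptions, not part of the original example.
    def compute_avg_return(environment, policy, num_episodes=10):
        total_return = 0.0
        for _ in range(num_episodes):
            time_step = environment.reset()
            episode_return = 0.0
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = environment.step(action_step.action)
                episode_return += time_step.reward
            total_return += episode_return
        avg_return = total_return / num_episodes
        return avg_return.numpy()[0]

    print('Average return before training:',
          compute_avg_return(eval_env, agent.policy, num_episodes=10))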
Esempio n. 21
0
- no impulse: red
- right impulse: green

In these state-space plots, the x-axis is the agent's position and the y-axis is its velocity.
"""

# %%
batch_size = 64

training_spec = KerasTrainingSpec(
    epochs=5000,
    training_batch_size=256,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
    verbose=0,
)
linear_transition_network = LinearTransitionNetwork(tf_env.observation_spec())
trajectory_sampling_strategy = InfiniteHorizonTrajectorySampling(batch_size, 1)
transition_model = KerasTransitionModel(
    [linear_transition_network],
    tf_env.observation_spec(),
    tf_env.action_spec(),
)
reward_model = ConstantReward(tf_env.observation_spec(), tf_env.action_spec())
sample_transitions = sample_uniformly_distributed_transitions(
    transition_model, 1000, reward_model
)

# %%
plot_mountain_car_transitions(
    sample_transitions.observation.numpy(),
    sample_transitions.action.numpy(),
Esempio n. 22
0
def test_all_mepo_variants_work(transition_model, trajectory_sampler,
                                model_free_agent_type):
    """
    Mepo Agent has prespecified transition model, trajectory sampler and model-free agent
    types. Here we check that all combinations execute without errors.
    """

    # set up the environment and the prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(
        observation_spec)

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    # define the agent; many of the transition model and trajectory optimiser
    # parameters can be arbitrary
    agent = MepoAgent(
        time_step_spec,
        action_spec,
        transition_model,
        1,
        10,
        tf.nn.relu,
        ensemble_size,
        False,
        1,
        1,
        [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler,
        horizon,
        population_size,
        model_free_agent_type,
        1,
        10,
        tf.nn.relu,
        2,
    )

    # we need some training data
    random_policy = RandomTFPolicy(
        time_step_spec,
        action_spec,
        info_spec=agent.collect_policy.info_spec,
    )
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        random_policy.trajectory_spec, batch_size=1, max_length=1000)
    collect_driver_random_policy = TFDriver(
        tf_env,
        random_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    initial_time_step = tf_env.reset()
    collect_driver_random_policy.run(initial_time_step)
    pets_agent_trainer = BackgroundPlanningAgentTrainer(10, 10)
    tf_training_scheduler = pets_agent_trainer.create_training_scheduler(
        agent, model_training_buffer)
    training_losses = tf_training_scheduler.maybe_train(
        tf.constant(10, dtype=tf.int64))
    assert EnvironmentModelComponents.TRANSITION in training_losses

    # test the agent
    collect_driver_planning_policy = TFDriver(
        tf_env,
        agent.collect_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
Esempio n. 23
0
carla_environment = CarlaEnvironment(log_dir='validation_log4/')
eval_env = TFPyEnvironment(environment=carla_environment)

# Allow TensorFlow to grow its GPU memory usage on demand instead of allocating all of it up front.
gpu = tf.config.experimental.list_physical_devices('GPU')[0]
tf.config.experimental.set_memory_growth(gpu, True)

# 2. Constructing the Categorical QNetworks: Online & Target.
# Default Activation Function: "Gelu".
# Default Weight Initialization: He initialization (variance scaling).
fc_layer_units = [128, 128]
conv_layer_units = [(4, 3, 1)]
num_atoms = 51

online_q_net = CategoricalQNetwork(
    input_tensor_spec=eval_env.observation_spec(),
    action_spec=eval_env.action_spec(),
    num_atoms=num_atoms,
    conv_layer_params=conv_layer_units,
    fc_layer_params=fc_layer_units,
    activation_fn=GELU())
target_q_net = CategoricalQNetwork(
    input_tensor_spec=eval_env.observation_spec(),
    action_spec=eval_env.action_spec(),
    num_atoms=num_atoms,
    conv_layer_params=conv_layer_units,
    fc_layer_params=fc_layer_units,
    activation_fn=GELU())

# Defining train_step, which will be used to store the current step.
train_step = tf.Variable(initial_value=0)
Esempio n. 24
0
def breakout_v4(seed=42):
    env = suite_gym.load("Breakout-v4")
    env.seed(seed)
    env.reset()

    repeating_env = ActionRepeat(env, times=4)
    for name in dir(tf_agents.environments.wrappers):
        obj = getattr(tf_agents.environments.wrappers, name)
        if hasattr(obj, "__base__") and issubclass(
                obj, tf_agents.environments.wrappers.PyEnvironmentBaseWrapper):
            print("{:27s} {}".format(name, obj.__doc__.split("\n")[0]))

    limited_repeating_env = suite_gym.load(
        "Breakout-v4",
        gym_env_wrappers=[partial(TimeLimit, max_episode_steps=10000)],
        env_wrappers=[partial(ActionRepeat, times=4)],
    )

    max_episode_steps = 27000  # <=> 108k ALE frames since 1 step = 4 frames
    environment_name = "BreakoutNoFrameskip-v4"

    env = suite_atari.load(
        environment_name,
        max_episode_steps=max_episode_steps,
        gym_env_wrappers=[AtariPreprocessing, FrameStack4],
    )

    env.seed(42)
    env.reset()
    time_step = env.step(np.array(1))  # FIRE
    for _ in range(4):
        time_step = env.step(np.array(3))  # LEFT

    def plot_observation(obs):
        # Since there are only 3 color channels, you cannot display 4 frames
        # with one primary color per frame. So this code computes the delta between
        # the current frame and the mean of the other frames, and it adds this delta
        # to the red and blue channels to get a pink color for the current frame.
        obs = obs.astype(np.float32)
        img_ = obs[..., :3]
        current_frame_delta = np.maximum(
            obs[..., 3] - obs[..., :3].mean(axis=-1), 0.0)
        img_[..., 0] += current_frame_delta
        img_[..., 2] += current_frame_delta
        img_ = np.clip(img_ / 150, 0, 1)
        plt.imshow(img_)
        plt.axis("off")

    plt.figure(figsize=(6, 6))
    plot_observation(time_step.observation)
    plt.tight_layout()
    plt.savefig("./images/preprocessed_breakout_plot.png",
                format="png",
                dpi=300)
    plt.show()

    tf_env = TFPyEnvironment(env)

    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.0)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        preprocessing_layers=preprocessing_layer,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
    )

    # see TF-agents issue #113
    # optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,
    #                                     epsilon=0.00001, centered=True)

    train_step = tf.Variable(0)
    update_period = 4  # run a training step every 4 collect steps
    optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4,
                                                    decay=0.95,
                                                    momentum=0.0,
                                                    epsilon=0.00001,
                                                    centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=250000 // update_period,  # <=> 1,000,000 ALE frames
        end_learning_rate=0.01,
    )  # final ε
    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=2000,  # <=> 32,000 ALE frames
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=train_step,
        epsilon_greedy=lambda: epsilon_fn(train_step),
    )
    agent.initialize()

    from tf_agents.replay_buffers import tf_uniform_replay_buffer

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=1000000)

    replay_buffer_observer = replay_buffer.add_batch

    class ShowProgress:
        def __init__(self, total):
            self.counter = 0
            self.total = total

        def __call__(self, trajectory):
            if not trajectory.is_boundary():
                self.counter += 1
            if self.counter % 100 == 0:
                print("\r{}/{}".format(self.counter, self.total), end="")

    from tf_agents.metrics import tf_metrics

    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]

    from tf_agents.eval.metric_utils import log_metrics
    import logging

    logging.getLogger().setLevel(logging.INFO)
    log_metrics(train_metrics)

    from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

    collect_driver = DynamicStepDriver(
        tf_env,
        agent.collect_policy,
        observers=[replay_buffer_observer] + train_metrics,
        num_steps=update_period,
    )  # collect 4 steps for each training iteration

    from tf_agents.policies.random_tf_policy import RandomTFPolicy

    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer.add_batch,
                   ShowProgress(20000)],
        num_steps=20000,
    )  # <=> 80,000 ALE frames
    final_time_step, final_policy_state = init_driver.run()
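    # Hedged sketch of the training loop that typically follows this setup;
    # the dataset parameters, iteration count and logging cadence below are
    # illustrative assumptions, not values from the original example.
    from tf_agents.utils import common

    dataset = replay_buffer.as_dataset(sample_batch_size=64,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)
    iterator = iter(dataset)

    collect_driver.run = common.function(collect_driver.run)
    agent.train = common.function(agent.train)

    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    for iteration in range(10000):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        if iteration % 1000 == 0:
            log_metrics(train_metrics)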
Esempio n. 25
0
def get_env_specs(c):
    dummy_env = TFPyEnvironment(StockEnvBasic(**c.default_env))
    return dummy_env.observation_spec(), dummy_env.action_spec()
Esempio n. 26
0
eval_interval = 100

if __name__ == '__main__':

    # Load the environments
    eval_py_env = get_env()
    eval_env = TFPyEnvironment(eval_py_env)
    train_env = TFPyEnvironment(
        ParallelPyEnvironment([get_env] * 4, start_serially=False))

    # Create a global step
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Create the actor network (with the normal distribution)
    actor_net = ActorDistributionNetwork(
        input_tensor_spec=train_env.observation_spec(),
        output_tensor_spec=train_env.action_spec(),
        fc_layer_params=(128, 256, 512, 512, 256),
        continuous_projection_net=normal_net)

    # Create the value network
    value_net = ValueNetwork(input_tensor_spec=train_env.observation_spec(),
                             fc_layer_params=(256, 512, 512))

    # Create the PPO agent
    ppo_agent = PPOClipAgent(time_step_spec=train_env.time_step_spec(),
                             action_spec=train_env.action_spec(),
                             optimizer=Adam(learning_rate=5e-4),
                             actor_net=actor_net,
                             value_net=value_net,
                             importance_ratio_clipping=0.2,