Example #1
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="CartPole-v1",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)

    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    actor_net = ActorDistributionNetwork(environment.observation_spec(),
                                         environment.action_spec(),
                                         fc_layer_params=(200, 100))
    value_net = ValueNetwork(environment.observation_spec(),
                             fc_layer_params=(200, 100))
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = PPOClipAgent(  # should be closer to the paper than PPOAgent...
        environment.time_step_spec(),
        environment.action_spec(),
        optimizer=tf.compat.v1.train.AdamOptimizer(),  # default None does not work
        actor_net=actor_net,
        value_net=value_net,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        lambda_value=0.5,
        discount_factor=0.95,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(400)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
Example #2
    def test_ppo(self):
        env_class = PolicyUnittestEnv
        learning_rate = 1e-1
        iterations = 20
        batch_size = 100
        steps_per_episode = 13
        env = env_class(batch_size, steps_per_episode)
        env = TFPyEnvironment(env)

        eval_env = env_class(batch_size, steps_per_episode)
        eval_env = TFPyEnvironment(eval_env)

        algorithm = create_algorithm(env, learning_rate=learning_rate)
        driver = SyncOffPolicyDriver(env,
                                     algorithm,
                                     debug_summaries=DEBUGGING,
                                     summarize_grads_and_vars=DEBUGGING)
        replayer = driver.exp_replayer
        eval_driver = OnPolicyDriver(eval_env,
                                     algorithm,
                                     training=False,
                                     greedy_predict=True)

        env.reset()
        eval_env.reset()
        time_step = driver.get_initial_time_step()
        policy_state = driver.get_initial_policy_state()
        for i in range(iterations):
            time_step, policy_state = driver.run(max_num_steps=batch_size *
                                                 steps_per_episode,
                                                 time_step=time_step,
                                                 policy_state=policy_state)

            experience = replayer.replay_all()
            driver.train(experience, num_updates=4, mini_batch_size=25)
            replayer.clear()
            eval_env.reset()
            eval_time_step, _ = eval_driver.run(
                max_num_steps=(steps_per_episode - 1) * batch_size)
            logging.info("%d reward=%f", i,
                         float(tf.reduce_mean(eval_time_step.reward)))

        eval_env.reset()
        eval_time_step, _ = eval_driver.run(
            max_num_steps=(steps_per_episode - 1) * batch_size)
        logging.info("reward=%f", float(tf.reduce_mean(eval_time_step.reward)))
        self.assertAlmostEqual(1.0,
                               float(tf.reduce_mean(eval_time_step.reward)),
                               delta=1e-1)
Example #3
def test_incorrect_termination_model():
    """
    The generic model-based agent should only allow a ConstantFalseTermination model.
    """

    # setup arguments for the model-based agent constructor
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    network = LinearTransitionNetwork(observation_spec)
    transition_model = KerasTransitionModel([network], observation_spec, action_spec)
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)
    termination_model = MountainCarTermination(observation_spec)
    policy = RandomTFPolicy(time_step_spec, action_spec)

    with pytest.raises(AssertionError) as excinfo:
        ModelBasedAgent(
            time_step_spec,
            action_spec,
            transition_model,
            reward_model,
            termination_model,
            initial_state_distribution_model,
            policy,
            policy,
        )

    assert "Only constant false termination supported" in str(excinfo.value)
Example #4
def test_tf_environment_with_random(n_episodes=20):
  """Test tf environment through random actions."""
  print(f'Testing tf environment over {n_episodes} episodes.')
  env = LakeMonsterEnvironment(**params)
  env = TFPyEnvironment(env)
  policy = RandomTFPolicy(time_step_spec=env.time_step_spec(),
                          action_spec=env.action_spec())

  ts = env.reset()
  rewards = []
  n_steps = []

  for _ in tqdm(range(n_episodes)):
    n_step = 0
    while not ts.is_last():
      action = policy.action(ts).action
      ts = env.step(action)
      n_step += 1

    reward = ts.reward
    rewards.append(reward)
    n_steps.append(n_step)
    ts = env.reset()

  # print results
  print('average num of steps per episode:', np.mean(n_steps))
  print('average reward per episode:', np.mean(rewards))
Example #5
    def test_alf_metrics(self, num_envs, learn_queue_cap, unroll_length,
                         actor_queue_cap, num_actors, num_iterations):
        episode_length = 5
        env_f = lambda: TFPyEnvironment(
            ValueUnittestEnv(batch_size=1, episode_length=episode_length))

        envs = [env_f() for _ in range(num_envs)]
        common.set_global_env(envs[0])
        alg = _create_ac_algorithm()
        driver = AsyncOffPolicyDriver(envs, alg, num_actors, unroll_length,
                                      learn_queue_cap, actor_queue_cap)
        driver.start()
        total_num_steps_ = 0
        for _ in range(num_iterations):
            total_num_steps_ += driver.run_async()
        driver.stop()

        total_num_steps = int(driver.get_metrics()[1].result())
        self.assertGreaterEqual(total_num_steps_, total_num_steps)

        # An experience is only put in the log queue after it has been put in the
        # learning queue, so when we stop the driver (which forces all queues to
        # stop), some experiences might be missing from the metric. Here we assert
        # an arbitrary lower bound of 2/5. The upper bound is due to the fact that
        # StepType.LAST is not recorded by the metric (episode_length == 5).
        self.assertLessEqual(total_num_steps, int(total_num_steps_ * 4 // 5))
        self.assertGreaterEqual(total_num_steps,
                                int(total_num_steps_ * 2 // 5))

        average_reward = int(driver.get_metrics()[2].result())
        self.assertEqual(average_reward, episode_length - 1)

        metric_episode_length = int(driver.get_metrics()[3].result())
        self.assertEqual(metric_episode_length, episode_length)
Example #6
    def test_merlin_algorithm(self):
        batch_size = 100
        steps_per_episode = 15
        gap = 10
        env = RNNPolicyUnittestEnv(
            batch_size, steps_per_episode, gap, obs_dim=3)
        env = TFPyEnvironment(env)

        common.set_global_env(env)

        algorithm = _create_merlin_algorithm(
            learning_rate=1e-3, debug_summaries=False)
        driver = OnPolicyDriver(env, algorithm, train_interval=6)

        eval_driver = OnPolicyDriver(env, algorithm, training=False)

        proc = psutil.Process(os.getpid())

        policy_state = driver.get_initial_policy_state()
        time_step = driver.get_initial_time_step()
        for i in range(100):
            t0 = time.time()
            time_step, policy_state, _ = driver.run(
                max_num_steps=150 * batch_size,
                time_step=time_step,
                policy_state=policy_state)
            mem = proc.memory_info().rss // 1e6
            logging.info('%s time=%.3f mem=%s' % (i, time.time() - t0, mem))

        env.reset()
        time_step, _ = eval_driver.run(max_num_steps=14 * batch_size)
        logging.info("eval reward=%.3f" % tf.reduce_mean(time_step.reward))
        self.assertAlmostEqual(
            1.0, float(tf.reduce_mean(time_step.reward)), delta=1e-2)
Example #7
    def test_actor_critic_continuous_policy(self):
        batch_size = 100
        steps_per_episode = 13
        env = PolicyUnittestEnv(batch_size,
                                steps_per_episode,
                                action_type=ActionType.Continuous)
        # We need to wrap env using TFPyEnvironment because the methods of env
        # have side effects (e.g., env._current_time_step can be changed)
        env = TFPyEnvironment(env)
        action_spec = env.action_spec()
        observation_spec = env.observation_spec()
        algorithm = ActorCriticAlgorithm(
            observation_spec=observation_spec,
            action_spec=action_spec,
            actor_network=ActorDistributionNetwork(observation_spec,
                                                   action_spec,
                                                   fc_layer_params=()),
            value_network=ValueNetwork(observation_spec, fc_layer_params=()),
            optimizer=tf.optimizers.Adam(learning_rate=1e-2))
        driver = OnPolicyDriver(env, algorithm, train_interval=2)
        eval_driver = OnPolicyDriver(env, algorithm, training=False)

        driver.run = tf.function(driver.run)

        t0 = time.time()
        driver.run(max_num_steps=2600 * batch_size)
        print("time=%s" % (time.time() - t0))

        env.reset()
        time_step, _ = eval_driver.run(max_num_steps=4 * batch_size)
        print("reward=%s" % tf.reduce_mean(time_step.reward))
        self.assertAlmostEqual(1.0,
                               float(tf.reduce_mean(time_step.reward)),
                               delta=5e-2)
Example #8
def test_tf_environment_wrapping():
    """
    Test wrapping the RL environment for use with TensorFlow Agents.

    Use Simple Server Queue for simplicity
    """
    # Set up single server queue.
    cost_per_buffer = np.ones((1, 1))
    initial_state = (0,)
    capacity = np.ones((1, 1)) * np.inf
    demand_rate_val = 0.7
    job_conservation_flag = True
    seed = 72

    demand_rate = np.array([demand_rate_val])[:, None]
    buffer_processing_matrix = - np.ones((1, 1))
    constituency_matrix = np.ones((1, 1))
    list_boundary_constraint_matrices = [constituency_matrix]

    # Construct environment.
    job_generator = ScaledBernoulliServicesPoissonArrivalsGenerator(
        demand_rate, buffer_processing_matrix, job_gen_seed=seed)
    assert job_generator.routes == {}
    state_initialiser = stinit.DeterministicCRWStateInitialiser(initial_state)
    env = RLControlledRandomWalk(cost_per_buffer, capacity, constituency_matrix, job_generator,
                                 state_initialiser, job_conservation_flag,
                                 list_boundary_constraint_matrices)

    # Try wrapping environment for tf agents.
    tf_env = TFPyEnvironment(GymWrapper(env))
    del tf_env
Example #9
def test_unknown_transition_model():
    """
    Pets Agent has prespecified transition model, RuntimeError should raise on unknown model.
    """

    # setup the environment and a prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(
        observation_spec)

    # trajectory optimiser
    trajectory_optimiser_type = TrajectoryOptimizationType.CrossEntropyMethod
    transition_model_type = "unknown_model"
    trajectory_sampler_type = TrajectorySamplerType.TS1

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    learning_rate = 0.9
    max_iterations = 5
    population_size = num_elites + 10
    number_of_particles = 1
    horizon = 1

    with pytest.raises(RuntimeError) as excinfo:
        PetsAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            trajectory_optimiser_type,
            horizon,
            population_size,
            number_of_particles,
            num_elites,
            learning_rate,
            max_iterations,
        )

    assert "Unknown transition model" in str(excinfo.value)
Example #10
def test_ensemble_size_set_correctly():
    """
    For ensemble transition models ensemble size needs to be larger than 1.
    """

    # setup the environment and a prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # transition model and model-free agent
    transition_model_type = TransitionModelType.DeterministicEnsemble
    trajectory_sampler_type = TrajectorySamplerType.TS1
    model_free_agent_type = ModelFreeAgentType.Ppo

    # some parameters need to be set correctly
    ensemble_size = 1
    population_size = 10
    horizon = 1

    # define the agent; many transition model and trajectory optimiser parameters can
    # be arbitrary
    with pytest.raises(AssertionError) as excinfo:
        MbpoAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            horizon,
            population_size,
            model_free_agent_type,
            1,
            10,
            tf.nn.relu,
            2,
            1,
        )

    assert "ensemble_size should be > 1" in str(excinfo.value)
Example #11
def test_unknown_transition_model():
    """
    Mepo Agent has prespecified transition model, RuntimeError should raise on unknown model.
    """

    # setup the environment and a prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # transition model and model-free agent
    transition_model_type = "unknown_model"
    trajectory_sampler_type = TrajectorySamplerType.TS1
    model_free_agent_type = ModelFreeAgentType.Ppo

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    with pytest.raises(RuntimeError) as excinfo:
        MbpoAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            horizon,
            population_size,
            model_free_agent_type,
            1,
            10,
            tf.nn.relu,
            2,
            1,
        )

    assert "Unknown transition model" in str(excinfo.value)
Example #12
def _create_environment_and_policy(batch_size):
    tf_batched_environment = TFPyEnvironment(
        BatchedPyEnvironment([
            PyEnvironmentMock(final_state=TRAJECTORY_LENGTH)
            for _ in range(batch_size)
        ]))
    policy = TFPolicyMock(
        tf_batched_environment.time_step_spec(),
        tf_batched_environment.action_spec(),
        batch_size=batch_size,
    )

    return tf_batched_environment, policy
Example #13
def load_model_checkpoint(c):  # returns the model at the given checkpoint

    dir_name = tf.train.latest_checkpoint(c.model_dir)
    #if ver_name =='None':
    #    check_or_make_dir(dir_name)
        
    #else:
    #    dir_name = os.path.join(dir_name,ver_name)
    dummy_env = TFPyEnvironment(StockEnvBasic(**c.default_env))
    time_step = dummy_env.reset()

    temp = ValueNet(**c.model_vars)
    # initialize the model
    temp(time_step.observation)
    checkpoint2 = tf.train.Checkpoint(module=temp)
    status = checkpoint2.restore(dir_name)
    return temp, checkpoint2
Example #14
    def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float, epsilon: float, gamma: float, imb_ratio: float,
                replay_buffer_max_length: int, layers: dict) -> None:
        """
        Create the Q-network, agent and policy

        Args:
            X_train: A np.ndarray for training samples.
            y_train: A np.ndarray for the class labels of the training samples.
            lr: learn rate for the optimizer (default Adam)
            epsilon: Used for the default epsilon greedy policy for choosing a random action.
            gamma: The discount factor for learning Q-values
            imb_ratio: ratio of imbalance. Used to specifiy reward in the environment
            replay_buffer_max_length: Maximum lenght of replay memory.
            layers: A dict containing the layers of the Q-Network (eg, conv, dense, rnn, dropout).
        """

        dense_layers = layers.get("dense")
        conv_layers = layers.get("conv")
        dropout_layers = layers.get("dropout")

        self.train_env = TFPyEnvironment(ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

        q_net = QNetwork(self.train_env.observation_spec(), self.train_env.action_spec(), conv_layer_params=conv_layers,
                         fc_layer_params=dense_layers, dropout_layer_params=dropout_layers)

        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)

        train_step_counter = tf.Variable(0)

        self.agent = DqnAgent(
            self.train_env.time_step_spec(),
            self.train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=train_step_counter,
            gamma=gamma,
            epsilon_greedy=epsilon,
        )

        self.agent.initialize()

        self.replay_buffer = TFUniformReplayBuffer(
            data_spec=self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=replay_buffer_max_length)
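The layers argument above is just a dict that groups the Q-network's layer parameters by kind ("dense", "conv", "dropout"). A minimal, self-contained sketch of how such a dict maps onto a QNetwork, using the standard CartPole-v1 environment purely for illustration (the snippet above uses the custom ClassifyEnv instead):

from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.networks.q_network import QNetwork

# A layers dict of the shape compile() expects.
layers = {"dense": (64, 64), "conv": None, "dropout": None}

train_env = TFPyEnvironment(suite_gym.load("CartPole-v1"))
q_net = QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    conv_layer_params=layers.get("conv"),
    fc_layer_params=layers.get("dense"),
    dropout_layer_params=layers.get("dropout"),
)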
Example #15
def test_planning_policy_batch_environment_model():
    """
    Ensure that planning policy is operational.
    """

    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                   tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)
Example #16
def evaluate_episode(policy, env_params):
    """Use naive while loop to evaluate policy in single episode."""
    if 'n_monsters' in env_params:
        env = MultiMonsterEnvironment
    elif 'is_jumping' in env_params:
        env = JumpingEnvironment
    else:
        env = LakeMonsterEnvironment
    py_env = env(**env_params)
    tf_env = TFPyEnvironment(py_env)
    ts = tf_env.reset()
    n_steps = 0
    while not ts.is_last():
        action = policy.action(ts)
        ts = tf_env.step(action.action)
        n_steps += 1

    reward = ts.reward.numpy().item()
    return reward, n_steps * py_env.step_size
Example #17
def collection_run(c, the_buffer, date_strs, cdict):
    """Fills replay buffer with experience from date_strs
    """
    n_trades = 0
    for date_str in date_strs:
        print(date_str)
        contract_str = cdict[date_str]
        the_env = StockEnvBasic(date_str, contract_str, **c.stock_env)
        the_env = TFPyEnvironment(the_env)
        time_step = the_env.reset()

        while not tf.reduce_all(time_step.is_last()):
            action = 1
            next_ts = the_env.step(action)
            pol_step = policy_step.PolicyStep(action)
            traj = trajectory.from_transition(time_step, pol_step, next_ts)
            the_buffer.add_batch(traj)
            time_step = next_ts
    return the_buffer
Example #18
    def create_policy_eval_video(self,
                                 env,
                                 policy,
                                 filename,
                                 num_episodes=5,
                                 fps=30):
        filename = filename + ".mp4"
        tf_env = TFPyEnvironment(env)
        with imageio.get_writer(filename, fps=fps) as video:
            for _ in range(num_episodes):
                time_step = tf_env.reset()
                tf_env.step(1)
                video.append_data(env.render())
                while not time_step.is_last():
                    action_step = policy.action(time_step)
                    time_step = tf_env.step(action_step.action)
                    video.append_data(env.render())
            video.close()
        return self.embed_mp4(filename)
Example #19
def create_many_policy_gif(uid, file_path, monster_speed=4.0):
    """Create a gif superimposing the actions of many policies."""
    n_steps = 300  # = timeout_factor / step_size
    step_size = 0.01
    fps = 10
    p_paths = glob.glob(configs.POLICY_DIR + uid + '*')

    all_positions = []
    colors = []
    for p_path in tqdm(p_paths):
        color = (np.random.randint(256), np.random.randint(128), 0)
        policy = tf.saved_model.load(p_path)
        env_params = policy.get_metadata()
        env_params = tf_to_py(env_params)

        # overwriting parameters
        env_params['step_size'] = step_size
        env_params['monster_speed'] = monster_speed
        py_env = LakeMonsterEnvironment(**env_params)
        tf_env = TFPyEnvironment(py_env)

        time_step = tf_env.reset()
        agent_positions = {}
        for step in range(n_steps):
            if not time_step.is_last():
                action = policy.action(time_step)
                time_step = tf_env.step(action.action)
            theta = py_env.total_monster_rotation - py_env.total_agent_rotation
            c, s = np.cos(theta), np.sin(theta)
            rot_matrix = np.array(((c, -s), (s, c)))
            agent_positions[step] = np.dot(rot_matrix, np.array((py_env.r, 0)))
        all_positions.append(agent_positions)
        colors.append(color)

    with imageio.get_writer(file_path, mode='I', fps=fps) as gif:
        for step in range(n_steps):
            positions = [item[step] for item in all_positions]
            im = render_many_agents(positions, colors, step, step_size, 4,
                                    monster_speed)
            gif.append_data(np.array(im))
    pygifsicle.optimize(file_path)
Example #20
def episode_as_video(py_env, policy, filepath, fps=10):
    """Create mp4 video through py_environment render method."""

    tf_env = TFPyEnvironment(py_env)
    with imageio.get_writer('tmp.mp4', fps=fps) as video:
        time_step = tf_env.reset()
        video.append_data(py_env.render())
        while not time_step.is_last():
            action = policy.action(time_step).action
            time_step = tf_env.step(action)
            video.append_data(py_env.render())
        for _ in range(3 * fps):  # play for 3 more seconds
            video.append_data(py_env.render())

    # giving video file a more descriptive name
    _, result = py_env.determine_reward()

    assert filepath.split('.')[1] == 'mp4'
    split = filepath.split('.')
    split[0] += '-' + result
    filepath = '.'.join(split)
    os.rename('tmp.mp4', filepath)
Example #21
def load_models():
  """Load existing saved models."""
  if os.path.exists(configs.AGENT_ID_PATH):

    deploy.save_model()
    agent = train.restore_existing_agent()
    model1 = verify.ModelWrapper(agent.q_net, agent.use_categorical)

    p = agent.env_params
    p['monster_speed'] = 3.5
    p['step_size'] = 0.02
    p['use_random_start'] = False

    env = LakeMonsterEnvironment(**p)
    step_increase = env.step_size / env.timeout_factor
    env = TFPyEnvironment(env)

    savepath = os.path.join(configs.TEMP_DIR, 'saved_model')
    model2 = tf.keras.models.load_model(savepath)

    return model1, model2, agent, env, step_increase

  return None
Example #22
    def test_alf_metrics(self, num_envs, learn_queue_cap, unroll_length,
                         actor_queue_cap, num_actors, num_iterations):
        episode_length = 5
        env_f = lambda: TFPyEnvironment(
            ValueUnittestEnv(batch_size=1, episode_length=episode_length))
        alg = _create_ac_algorithm(env_f())
        driver = AsyncOffPolicyDriver(env_f, alg, num_envs, num_actors,
                                      unroll_length, learn_queue_cap,
                                      actor_queue_cap)
        driver.start()
        total_num_steps_ = 0
        for _ in range(num_iterations):
            total_num_steps_ += driver.run_async()
        driver.stop()

        total_num_steps = int(driver.get_metrics()[1].result())
        self.assertGreaterEqual(total_num_steps_, total_num_steps)
        self.assertGreaterEqual(
            total_num_steps,  # multiply by 2/3 because 1/3 of steps are StepType.LAST
            total_num_steps_ * 2 // 3)
        average_reward = int(driver.get_metrics()[2].result())
        self.assertEqual(average_reward, episode_length - 1)
        metric_episode_length = int(driver.get_metrics()[3].result())
        self.assertEqual(metric_episode_length, episode_length)
Example #23
def episode_as_gif(py_env, policy, save_path, fps=10, show_path=True):
    """Create gif through py_environment render method."""

    tf_env = TFPyEnvironment(py_env)
    path = []
    with imageio.get_writer(save_path, mode='I', fps=fps) as gif:
        time_step = tf_env.reset()
        # using the policy_state to deal with scripted_policy possibility
        policy_state = policy.get_initial_state(batch_size=1)
        gif.append_data(py_env.render())

        while not time_step.is_last():
            action = policy.action(time_step, policy_state)
            time_step = tf_env.step(action.action)
            im, real_position = py_env.render('return_real')
            path.append(real_position)
            if show_path:
                im = render_agent_path(im, path)
            policy_state = action.state
            gif.append_data(np.array(im))

        for _ in range(fps):  # play for 1 more second
            gif.append_data(py_env.render())
    pygifsicle.optimize(save_path)
Example #24
"""


# %matplotlib inline


# %% [markdown]
"""
This example is based on Mountain Car because it is straightforward to visualise the state space
and the linearised dynamics contain enough information to learn a controller using reinforcement
learning.
"""

# %%
global_step = tf.compat.v1.train.get_or_create_global_step()
tf_env = TFPyEnvironment(suite_gym.load("MountainCar-v0"))

# %% [markdown]
"""
## Dynamics model

We define the linear model that is going to be used to model the transition function of this
environment. We plot the predicted dynamics of the untrained model. The action space of the
mountain car environment consists of three discrete elements. These are represented in all
subsequent plots with three colours:
- left impulse: blue
- no impulse: red
- right impulse: green

In these state-space plots, the x-axis is the agent's position and the y-axis is the velocity.
"""
Example #25
def get_env_specs(c):
    dummy_env = TFPyEnvironment(StockEnvBasic(**c.default_env))
    return dummy_env.observation_spec(), dummy_env.action_spec()
Example #26
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="Pendulum-v0",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)

    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    critic_network = CriticNetwork(
        input_tensor_spec=(environment.observation_spec(),
                           environment.action_spec()),
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=(200, 100),
    )
    actor_network = ActorNetwork(
        input_tensor_spec=environment.observation_spec(),
        output_tensor_spec=environment.action_spec(),
        fc_layer_params=(200, 100),
    )
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DdpgAgent(
        time_step_spec=environment.time_step_spec(),
        action_spec=environment.action_spec(),
        critic_network=critic_network,
        actor_network=actor_network,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(),
        train_step_counter=global_step,
    )

    agent_trainer = OffPolicyModelFreeAgentTrainer(1, 256)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
Example #27
def create_environment(agent_mailbox, service_mailbox,
                       discount) -> TFPyEnvironment:
    environment = PongEnvironment(agent_mailbox, service_mailbox, discount)
    return TFPyEnvironment(environment)
import tensorflow as tf

from tensorflow_addons.optimizers.yogi import Yogi
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.losses import Huber
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.networks.categorical_q_network import CategoricalQNetwork
from tf_agents.agents.categorical_dqn.categorical_dqn_agent import CategoricalDqnAgent
from tf_agents.utils.common import Checkpointer
from environment.carla_environment import CarlaEnvironment

# Allow TensorFlow to grow its GPU memory allocation as needed instead of reserving it all up front.
gpu = tf.config.experimental.list_physical_devices('GPU')[0]
tf.config.experimental.set_memory_growth(gpu, True)

# 1. Creating the tf-environment for validation.
carla_environment = CarlaEnvironment(log_dir='validation_log4/')
eval_env = TFPyEnvironment(environment=carla_environment)

# 2. Constructing the Categorical QNetworks: Online & Target.
# Default Activation Function: "Gelu".
# Default Weight Initialization: "He (Xavier) Initialization".
fc_layer_units = [128, 128]
conv_layer_units = [(4, 3, 1)]
num_atoms = 51

online_q_net = CategoricalQNetwork(
    input_tensor_spec=eval_env.observation_spec(),
    action_spec=eval_env.action_spec(),
    num_atoms=num_atoms,
    conv_layer_params=conv_layer_units,
    fc_layer_params=fc_layer_units)
    def wrap_env(self):
        tf_env = TFPyEnvironment(self.env)
        return tf_env
            losses[step, tr] = agent.train(experience=trajectories).loss
            for met in range(len(train_metrics)):
                train_metric_results[
                    met, step, tr] = train_metric_inst[met].result().numpy()
        evaluations[step, :] = _evaluate_dyke_agent(eval_env, agent,
                                                    eval_episodes)
    return {
        'loss': losses,
        'eval': evaluations,
        'train-metrics': train_metric_results
    }


if __name__ == '__main__':
    train_py_env = DykeEnvironment(**dyke_environment_demo_params())
    train_tf_env = TFPyEnvironment(train_py_env)
    eval_py_env = DykeEnvironment(**dyke_environment_demo_params())
    eval_tf_env = TFPyEnvironment(eval_py_env)

    dqn_agent = dyke_dqn_agent(train_tf_env)  # could also have been eval env
    dqn_agent.initialize()

    num_train_steps: int = int(1e2)
    spe: int = int(np.ceil(train_py_env.timeout_time / train_py_env.delta_t))

    # bat = _dyke_replay_buffer(train_tf_env, dqn_agent, int(1e4))
    # print('Running the episode driver!')
    # _, _ = DynamicStepDriver(train_tf_env, dqn_agent.policy, [bat.add_batch], num_steps=spe).run()
    # print('Done!')
    #
    # it = iter(bat.as_dataset())