def test_incorrect_termination_model():
    """
    The generic model-based agent should only allow a ConstantFalseTermination model.
    """
    # setup arguments for the model-based agent constructor
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    network = LinearTransitionNetwork(observation_spec)
    transition_model = KerasTransitionModel([network], observation_spec, action_spec)
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)
    termination_model = MountainCarTermination(observation_spec)
    policy = RandomTFPolicy(time_step_spec, action_spec)

    with pytest.raises(AssertionError) as excinfo:
        ModelBasedAgent(
            time_step_spec,
            action_spec,
            transition_model,
            reward_model,
            termination_model,
            initial_state_distribution_model,
            policy,
            policy,
        )

    assert "Only constant false termination supported" in str(excinfo.value)
def collect_steps(env: tf_py_environment.TFPyEnvironment, policy: tf_policy.Base,
                  buffer: ReplayBuffer):
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    buffer.add_batch(traj)
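# Usage sketch for `collect_steps` (a hedged example, not from the original
# source): the environment, policy, and buffer construction below are
# illustrative assumptions. The Trajectory data spec is assembled by hand so
# the example does not depend on an agent already existing.
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory

env = tf_py_environment.TFPyEnvironment(suite_gym.load("CartPole-v1"))
random_policy = random_tf_policy.RandomTFPolicy(
    env.time_step_spec(), env.action_spec())
data_spec = trajectory.Trajectory(
    step_type=env.time_step_spec().step_type,
    observation=env.time_step_spec().observation,
    action=env.action_spec(),
    policy_info=random_policy.info_spec,
    next_step_type=env.time_step_spec().step_type,
    reward=env.time_step_spec().reward,
    discount=env.time_step_spec().discount)
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=data_spec, batch_size=env.batch_size, max_length=10000)

env.reset()
for _ in range(100):  # seed the buffer with random-policy experience
    collect_steps(env, random_policy, replay_buffer)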
def dyke_dqn_agent(env: TFPyEnvironment, layers: Optional[List[Layer]] = None) -> DqnAgent:
    """
    Prepares a deep Q-network (DQN) agent for use in the dyke maintenance environment.

    :param env: The dyke environment on which to base the DQN agent.
    :param layers: Optional. A list of layers to supply to the DQN agent's network.
    :return: The agent.
    """
    layers = fully_connected_dyke_dqn_agent_network(sizes=(100, 50)) if layers is None else layers
    # prepare the Q-values layer
    action_as: BoundedArraySpec = from_spec(env.action_spec())
    number_actions: int = int(action_as.maximum - action_as.minimum + 1)
    q_values_layer: Layer = Dense(
        units=number_actions,
        activation=None,
        kernel_initializer=RandomUniform(minval=-3e-3, maxval=3e-3),
        bias_initializer=Constant(-2e-1))
    net = Sequential(layers=layers + [q_values_layer])
    # instantiate and return the agent
    optimizer = Adam(learning_rate=1e-3)
    train_step_counter = Variable(initial_value=0)
    return DqnAgent(
        time_step_spec=env.time_step_spec(),
        action_spec=env.action_spec(),
        q_network=net,
        optimizer=optimizer,
        epsilon_greedy=0.1,
        td_errors_loss_fn=element_wise_squared_loss,
        train_step_counter=train_step_counter)
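# Usage sketch (hedged): the dyke environment construction is project-specific,
# but any TFPyEnvironment with a bounded discrete action spec works, so a
# standard Gym task stands in here for illustration.
from tf_agents.environments import suite_gym

env = TFPyEnvironment(suite_gym.load("CartPole-v1"))
agent = dyke_dqn_agent(env)
agent.initialize()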
def save_environment_agent_video(
    filename: str,
    agent: tf_agent.TFAgent,
    tf_env: TFPyEnvironment,
    py_env: TimeLimit,
    num_episodes: int = 1,
) -> None:
    """
    Save a video of an agent acting in the environment. The `render` method
    needs to be available in the Python version of the environment.

    TODO:
    - how to prevent opening a window when saving a video?
    - sometimes nothing is saved?
    - gym wrappers monitoring VideoRecorder

    :param filename: A valid path to which a file with the video will be saved.
    :param agent: An agent whose policy will be evaluated.
    :param tf_env: A TensorFlow environment used for interaction with the agent.
    :param py_env: A Python OpenAI Gym environment used for rendering the video.
        The environment has to provide a `render` method.
    :param num_episodes: The number of episodes to evaluate.
    :return: None. A video is saved to `filename`.
    """
    with imageio.get_writer(filename, fps=60) as video:
        for _ in range(num_episodes):
            time_step = tf_env.reset()
            video.append_data(py_env.render())
            while not time_step.is_last():
                action_step = agent.policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                video.append_data(py_env.render())
    py_env.close()
def compute_total_reward(env: TFPyEnvironment, policy):
    total_reward = 0.0
    time_step = env.reset()
    while not time_step.is_last():
        policy_step = policy.action(time_step)
        time_step = env.step(policy_step.action)
        total_reward += time_step.reward
    return total_reward.numpy()[0]
def collect_step(env: tf_py_environment.TFPyEnvironment, policy, buffer):
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    # Add trajectory to the replay buffer
    buffer.add_batch(traj)
def step(
    environment: TFPyEnvironment,
    policy: tf_policy.TFPolicy,
    replay_buffer: ReplayBuffer
) -> typing.Tuple[float, bool]:
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
    return next_time_step.reward.numpy()[0], next_time_step.is_last()
def test_unknown_transition_model():
    """
    The Pets agent has a prespecified transition model; a RuntimeError should be
    raised for an unknown model.
    """
    # set up the environment and prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # trajectory optimiser
    trajectory_optimiser_type = TrajectoryOptimizationType.CrossEntropyMethod
    transition_model_type = "unknown_model"
    trajectory_sampler_type = TrajectorySamplerType.TS1

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    learning_rate = 0.9
    max_iterations = 5
    population_size = num_elites + 10
    number_of_particles = 1
    horizon = 1

    with pytest.raises(RuntimeError) as excinfo:
        PetsAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            trajectory_optimiser_type,
            horizon,
            population_size,
            number_of_particles,
            num_elites,
            learning_rate,
            max_iterations,
        )

    assert "Unknown transition model" in str(excinfo.value)
def test_ensemble_size_set_correctly():
    """
    For ensemble transition models, the ensemble size needs to be larger than 1.
    """
    # set up the environment and prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # transition model and model-free agent
    transition_model_type = TransitionModelType.DeterministicEnsemble
    trajectory_sampler_type = TrajectorySamplerType.TS1
    model_free_agent_type = ModelFreeAgentType.Ppo

    # some parameters need to be set correctly
    ensemble_size = 1
    population_size = 10
    horizon = 1

    # define the agent; many transition model and trajectory optimiser
    # parameters can be arbitrary
    with pytest.raises(AssertionError) as excinfo:
        MbpoAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            horizon,
            population_size,
            model_free_agent_type,
            1,
            10,
            tf.nn.relu,
            2,
            1,
        )

    assert "ensemble_size should be > 1" in str(excinfo.value)
def test_unknown_transition_model():
    """
    The Mepo agent has a prespecified transition model; a RuntimeError should be
    raised for an unknown model.
    """
    # set up the environment and prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # transition model and model-free agent
    transition_model_type = "unknown_model"
    trajectory_sampler_type = TrajectorySamplerType.TS1
    model_free_agent_type = ModelFreeAgentType.Ppo

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    with pytest.raises(RuntimeError) as excinfo:
        MbpoAgent(
            time_step_spec,
            action_spec,
            transition_model_type,
            1,
            10,
            tf.nn.relu,
            ensemble_size,
            False,
            1,
            1,
            [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
            reward_model,
            initial_state_distribution_model,
            trajectory_sampler_type,
            horizon,
            population_size,
            model_free_agent_type,
            1,
            10,
            tf.nn.relu,
            2,
            1,
        )

    assert "Unknown transition model" in str(excinfo.value)
def train_dyke_agent(train_env: TFPyEnvironment,
                     eval_env: TFPyEnvironment,
                     agent: DqnAgent,
                     train_steps: int,
                     steps_per_episode: int,
                     eval_episodes: int) -> Dict[str, Any]:
    """
    Trains the DQN agent on the dyke maintenance task.

    :param train_env: The training environment.
    :param eval_env: The environment for testing agent performance.
    :param agent: The agent.
    :param train_steps: The number of training steps to use.
    :param steps_per_episode: The number of time steps that can be taken in a single dyke environment episode.
    :param eval_episodes: The number of episodes to use per evaluation.
    :return: A mapping to various metrics pertaining to the training's results.
    """
    losses: np.ndarray = np.zeros(shape=(train_steps, steps_per_episode))
    evaluations: np.ndarray = np.zeros(shape=(train_steps, eval_episodes))
    train_metrics: Tuple = (AverageReturnMetric,)
    train_metric_results: np.ndarray = np.zeros(
        shape=(len(train_metrics), train_steps, steps_per_episode))
    for step in range(train_steps):
        # we uniformly sample experiences (single time steps) from one episode per train step
        print('STEP %d/%d' % (step + 1, train_steps))
        train_env.reset()
        rep_buf = _dyke_replay_buffer(train_env, agent, steps_per_episode)
        # instantiate the metrics
        train_metric_inst: Tuple = tuple(metric() for metric in train_metrics)
        obs: Tuple = (rep_buf.add_batch,) + train_metric_inst
        # experience a single episode using the agent's current configuration
        _ = DynamicStepDriver(
            env=train_env,
            policy=agent.collect_policy,
            observers=obs,
            num_steps=steps_per_episode).run()
        dataset: tf.data.Dataset = rep_buf.as_dataset(
            sample_batch_size=_REP_BUF_BATCH_SIZE,
            num_steps=_REP_BUF_NUM_STEPS)
        iterator = iter(dataset)
        for tr in range(steps_per_episode):
            trajectories, _ = next(iterator)
            losses[step, tr] = agent.train(experience=trajectories).loss
            for met in range(len(train_metrics)):
                train_metric_results[met, step, tr] = train_metric_inst[met].result().numpy()
        evaluations[step, :] = _evaluate_dyke_agent(eval_env, agent, eval_episodes)
    return {
        'loss': losses,
        'eval': evaluations,
        'train-metrics': train_metric_results,
    }
def _evaluate_dyke_agent(env: TFPyEnvironment, agent: DqnAgent,
                         num_episodes: int = 10) -> np.ndarray:
    returns: np.ndarray = np.zeros(shape=(num_episodes,))
    for ep in range(num_episodes):
        time_step: TimeStep = env.reset()
        episode_return: float = 0.0
        while not time_step.is_last():
            action_step = agent.policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        returns[ep] = episode_return
    return returns
def _create_environment_and_policy(batch_size):
    tf_batched_environment = TFPyEnvironment(
        BatchedPyEnvironment([
            PyEnvironmentMock(final_state=TRAJECTORY_LENGTH)
            for _ in range(batch_size)
        ]))
    policy = TFPolicyMock(
        tf_batched_environment.time_step_spec(),
        tf_batched_environment.action_spec(),
        batch_size=batch_size,
    )
    return tf_batched_environment, policy
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="CartPole-v1",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)
    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    actor_net = ActorDistributionNetwork(
        environment.observation_spec(),
        environment.action_spec(),
        fc_layer_params=(200, 100))
    value_net = ValueNetwork(environment.observation_spec(), fc_layer_params=(200, 100))

    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = PPOClipAgent(  # should be closer to the paper than PPOAgent...
        environment.time_step_spec(),
        environment.action_spec(),
        optimizer=tf.compat.v1.train.AdamOptimizer(),  # default None does not work
        actor_net=actor_net,
        value_net=value_net,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        lambda_value=0.5,
        discount_factor=0.95,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(400)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
def compute_avg_return(env: tf_py_environment.TFPyEnvironment, policy, num_episodes):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = env.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]
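# Usage sketch (hedged example; the environment and policy below are
# assumptions): a random-policy score gives a baseline against which the
# trained agent's average return can be compared.
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.policies import random_tf_policy

eval_env = tf_py_environment.TFPyEnvironment(suite_gym.load("CartPole-v1"))
random_policy = random_tf_policy.RandomTFPolicy(
    eval_env.time_step_spec(), eval_env.action_spec())
baseline = compute_avg_return(eval_env, random_policy, num_episodes=10)
print(f"random-policy average return: {baseline:.2f}")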
def test_alf_metrics(self, num_envs, learn_queue_cap, unroll_length,
                     actor_queue_cap, num_actors, num_iterations):
    episode_length = 5
    env_f = lambda: TFPyEnvironment(
        ValueUnittestEnv(batch_size=1, episode_length=episode_length))
    envs = [env_f() for _ in range(num_envs)]
    common.set_global_env(envs[0])
    alg = _create_ac_algorithm()
    driver = AsyncOffPolicyDriver(envs, alg, num_actors, unroll_length,
                                  learn_queue_cap, actor_queue_cap)
    driver.start()
    total_num_steps_ = 0
    for _ in range(num_iterations):
        total_num_steps_ += driver.run_async()
    driver.stop()

    total_num_steps = int(driver.get_metrics()[1].result())
    self.assertGreaterEqual(total_num_steps_, total_num_steps)

    # An exp is only put in the log queue after it's put in the learning queue.
    # So when we stop the driver (which will force all queues to stop), some
    # exps might be missing from the metric. Here we assert an arbitrary lower
    # bound of 2/5. The upper bound is due to the fact that StepType.LAST is
    # not recorded by the metric (episode_length == 5).
    self.assertLessEqual(total_num_steps, int(total_num_steps_ * 4 // 5))
    self.assertGreaterEqual(total_num_steps, int(total_num_steps_ * 2 // 5))

    average_reward = int(driver.get_metrics()[2].result())
    self.assertEqual(average_reward, episode_length - 1)

    # use a fresh name so the expected value is not overwritten before the
    # assertion (the original compared the metric against itself)
    metric_episode_length = int(driver.get_metrics()[3].result())
    self.assertEqual(metric_episode_length, episode_length)
def test_merlin_algorithm(self):
    batch_size = 100
    steps_per_episode = 15
    gap = 10
    env = RNNPolicyUnittestEnv(batch_size, steps_per_episode, gap, obs_dim=3)
    env = TFPyEnvironment(env)
    common.set_global_env(env)

    algorithm = _create_merlin_algorithm(learning_rate=1e-3, debug_summaries=False)
    driver = OnPolicyDriver(env, algorithm, train_interval=6)
    eval_driver = OnPolicyDriver(env, algorithm, training=False)

    proc = psutil.Process(os.getpid())

    policy_state = driver.get_initial_policy_state()
    time_step = driver.get_initial_time_step()
    for i in range(100):
        t0 = time.time()
        time_step, policy_state, _ = driver.run(
            max_num_steps=150 * batch_size,
            time_step=time_step,
            policy_state=policy_state)
        mem = proc.memory_info().rss // 1e6
        logging.info('%s time=%.3f mem=%s' % (i, time.time() - t0, mem))

    env.reset()
    time_step, _ = eval_driver.run(max_num_steps=14 * batch_size)
    logging.info("eval reward=%.3f" % tf.reduce_mean(time_step.reward))
    self.assertAlmostEqual(
        1.0, float(tf.reduce_mean(time_step.reward)), delta=1e-2)
def test_actor_critic_continuous_policy(self):
    batch_size = 100
    steps_per_episode = 13
    env = PolicyUnittestEnv(
        batch_size, steps_per_episode, action_type=ActionType.Continuous)
    # We need to wrap env using TFPyEnvironment because the methods of env
    # have side effects (e.g., env._current_time_step can be changed)
    env = TFPyEnvironment(env)
    action_spec = env.action_spec()
    observation_spec = env.observation_spec()
    algorithm = ActorCriticAlgorithm(
        observation_spec=observation_spec,
        action_spec=action_spec,
        actor_network=ActorDistributionNetwork(
            observation_spec, action_spec, fc_layer_params=()),
        value_network=ValueNetwork(observation_spec, fc_layer_params=()),
        optimizer=tf.optimizers.Adam(learning_rate=1e-2))
    driver = OnPolicyDriver(env, algorithm, train_interval=2)
    eval_driver = OnPolicyDriver(env, algorithm, training=False)

    driver.run = tf.function(driver.run)

    t0 = time.time()
    driver.run(max_num_steps=2600 * batch_size)
    print("time=%s" % (time.time() - t0))

    env.reset()
    time_step, _ = eval_driver.run(max_num_steps=4 * batch_size)
    print("reward=%s" % tf.reduce_mean(time_step.reward))
    self.assertAlmostEqual(
        1.0, float(tf.reduce_mean(time_step.reward)), delta=5e-2)
def test_tf_environment_wrapping():
    """
    Test wrapping the RL environment for use with TensorFlow Agents.
    A single server queue is used for simplicity.
    """
    # Set up single server queue.
    cost_per_buffer = np.ones((1, 1))
    initial_state = (0,)
    capacity = np.ones((1, 1)) * np.inf
    demand_rate_val = 0.7
    job_conservation_flag = True
    seed = 72
    demand_rate = np.array([demand_rate_val])[:, None]
    buffer_processing_matrix = -np.ones((1, 1))
    constituency_matrix = np.ones((1, 1))
    list_boundary_constraint_matrices = [constituency_matrix]

    # Construct environment.
    job_generator = ScaledBernoulliServicesPoissonArrivalsGenerator(
        demand_rate, buffer_processing_matrix, job_gen_seed=seed)
    assert job_generator.routes == {}
    state_initialiser = stinit.DeterministicCRWStateInitialiser(initial_state)
    env = RLControlledRandomWalk(
        cost_per_buffer, capacity, constituency_matrix, job_generator,
        state_initialiser, job_conservation_flag,
        list_boundary_constraint_matrices)

    # Try wrapping environment for tf agents.
    tf_env = TFPyEnvironment(GymWrapper(env))
    del tf_env
def test_tf_environment_with_random(n_episodes=20):
    """Test tf environment through random actions."""
    print(f'Testing tf environment over {n_episodes} episodes.')
    env = LakeMonsterEnvironment(**params)
    env = TFPyEnvironment(env)
    policy = RandomTFPolicy(
        time_step_spec=env.time_step_spec(), action_spec=env.action_spec())

    ts = env.reset()
    rewards = []
    n_steps = []
    for _ in tqdm(range(n_episodes)):
        n_step = 0
        while not ts.is_last():
            action = policy.action(ts).action
            ts = env.step(action)
            n_step += 1
        reward = ts.reward
        rewards.append(reward)
        n_steps.append(n_step)
        ts = env.reset()

    # print results
    print('average num of steps per episode:', np.mean(n_steps))
    print('average reward per episode', np.mean(rewards))
def compile(self, X_train: np.ndarray, y_train: np.ndarray, lr: float,
            epsilon: float, gamma: float, imb_ratio: float,
            replay_buffer_max_length: int, layers: dict) -> None:
    """
    Create the Q-network, agent and policy.

    Args:
        X_train: A np.ndarray of training samples.
        y_train: A np.ndarray of class labels for the training samples.
        lr: Learning rate for the optimizer (default Adam).
        epsilon: Used for the default epsilon-greedy policy for choosing a random action.
        gamma: The discount factor for learning Q-values.
        imb_ratio: Ratio of imbalance. Used to specify the reward in the environment.
        replay_buffer_max_length: Maximum length of the replay memory.
        layers: A dict containing the layers of the Q-network (e.g. conv, dense, rnn, dropout).
    """
    dense_layers = layers.get("dense")
    conv_layers = layers.get("conv")
    dropout_layers = layers.get("dropout")

    self.train_env = TFPyEnvironment(
        ClassifyEnv(X_train, y_train, imb_ratio))  # create a custom environment

    q_net = QNetwork(
        self.train_env.observation_spec(),
        self.train_env.action_spec(),
        conv_layer_params=conv_layers,
        fc_layer_params=dense_layers,
        dropout_layer_params=dropout_layers)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr)
    train_step_counter = tf.Variable(0)

    self.agent = DqnAgent(
        self.train_env.time_step_spec(),
        self.train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter,
        gamma=gamma,
        epsilon_greedy=epsilon,
    )
    self.agent.initialize()

    self.replay_buffer = TFUniformReplayBuffer(
        data_spec=self.agent.collect_data_spec,
        batch_size=self.train_env.batch_size,
        max_length=replay_buffer_max_length)
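# Training-loop sketch for the class above (hedged: `model` names an instance
# on which `compile` has already been called, and `collect_step` refers to a
# helper like the one defined earlier; both names are assumptions). Each
# iteration collects one transition and runs one gradient step on a sampled
# minibatch of two-step trajectories.
dataset = model.replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=64, num_steps=2).prefetch(3)
iterator = iter(dataset)
for _ in range(1000):
    collect_step(model.train_env, model.agent.collect_policy, model.replay_buffer)
    experience, _ = next(iterator)
    loss = model.agent.train(experience).loss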
def test_ppo(self):
    batch_size = 100
    steps_per_episode = 13
    env = PolicyUnittestEnv(batch_size, steps_per_episode)
    env = TFPyEnvironment(env)

    eval_env = PolicyUnittestEnv(batch_size, steps_per_episode)
    eval_env = TFPyEnvironment(eval_env)

    algorithm = create_algorithm(env)
    driver = SyncOffPolicyDriver(
        env, algorithm, debug_summaries=True, summarize_grads_and_vars=True)
    replayer = driver.exp_replayer
    eval_driver = OnPolicyDriver(
        eval_env, algorithm, training=False, greedy_predict=True)

    env.reset()
    eval_env.reset()
    time_step = driver.get_initial_time_step()
    policy_state = driver.get_initial_policy_state()
    for i in range(20):
        time_step, policy_state = driver.run(
            max_num_steps=batch_size * steps_per_episode,
            time_step=time_step,
            policy_state=policy_state)
        experience = replayer.replay_all()
        driver.train(experience, mini_batch_size=25)
        replayer.clear()
        eval_env.reset()
        eval_time_step, _ = eval_driver.run(
            max_num_steps=(steps_per_episode - 1) * batch_size)
        logging.info("%d reward=%f", i,
                     float(tf.reduce_mean(eval_time_step.reward)))

    eval_env.reset()
    eval_time_step, _ = eval_driver.run(
        max_num_steps=(steps_per_episode - 1) * batch_size)
    logging.info("reward=%f", float(tf.reduce_mean(eval_time_step.reward)))
    self.assertAlmostEqual(
        1.0, float(tf.reduce_mean(eval_time_step.reward)), delta=1e-1)
def load_model_checkpoint(c):
    """Return the model restored from the latest checkpoint in `c.model_dir`."""
    dir_name = tf.train.latest_checkpoint(c.model_dir)
    dummy_env = TFPyEnvironment(StockEnvBasic(**c.default_env))
    time_step = dummy_env.reset()
    temp = ValueNet(**c.model_vars)
    # call the model once to build its variables before restoring
    temp(time_step.observation)
    checkpoint2 = tf.train.Checkpoint(module=temp)
    status = checkpoint2.restore(dir_name)
    return temp, checkpoint2
def create_video(py_environment: PyEnvironment, tf_environment: TFPyEnvironment,
                 policy: tf_policy, num_episodes=10, video_filename='imageio.mp4'):
    print("Generating video %s" % video_filename)
    with imageio.get_writer(video_filename, fps=60) as video:
        for episode in range(num_episodes):
            print("Generating episode %d of %d" % (episode, num_episodes))
            time_step = tf_environment.reset()
            video.append_data(py_environment.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_environment.step(action_step.action)
                video.append_data(py_environment.render())
def compute_average_reward(env: tf_py_environment.TFPyEnvironment,
                           policy: tf_policy.Base, num_episodes=10) -> float:
    total_reward = 0
    for _ in range(num_episodes):
        time_step: ts.TimeStep = env.reset()
        episode_reward = 0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_reward += time_step.reward
        total_reward += episode_reward
    return total_reward / num_episodes
def evaluate_episode(policy, env_params):
    """Use naive while loop to evaluate policy in single episode."""
    if 'n_monsters' in env_params:
        env = MultiMonsterEnvironment
    elif 'is_jumping' in env_params:
        env = JumpingEnvironment
    else:
        env = LakeMonsterEnvironment
    py_env = env(**env_params)
    tf_env = TFPyEnvironment(py_env)

    ts = tf_env.reset()
    n_steps = 0
    while not ts.is_last():
        action = policy.action(ts)
        ts = tf_env.step(action.action)
        n_steps += 1

    reward = ts.reward.numpy().item()
    return reward, n_steps * py_env.step_size
def create_video(py_environment: PyEnvironment, tf_environment: TFPyEnvironment,
                 policy: tf_policy, num_episodes=10, video_filename='imageio.mp4'):
    print("Generating video %s" % video_filename)
    with imageio.get_writer(video_filename, fps=60) as video:
        for episode in range(num_episodes):
            episode_return = 0.0
            time_step = tf_environment.reset()
            video.append_data(py_environment.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_environment.step(action_step.action)
                episode_return += time_step.reward
                video.append_data(py_environment.render())
            print(f"Generated episode {episode} of {num_episodes}. "
                  f"Return: {episode_return}")
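# Usage sketch (hedged: `agent` is assumed to be an already-trained TFAgent).
# The same underlying py environment must back both arguments, so rendering
# tracks the steps the TF environment takes.
from tf_agents.environments import suite_gym

py_env = suite_gym.load("CartPole-v1")
tf_env = TFPyEnvironment(py_env)
create_video(py_env, tf_env, agent.policy, num_episodes=3,
             video_filename="cartpole.mp4")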
def generic_dqn_agent(env: TFPyEnvironment) -> (dqn_agent.DqnAgent, q_network.QNetwork):
    """Return a generic DQN agent.

    Args:
        env (TFPyEnvironment): The environment the agent will live in.

    Returns:
        dqn_agent.DqnAgent: The agent to train.
        q_network.QNetwork: The network used in the agent.
    """
    q_net = q_network.QNetwork(
        env.observation_spec(),
        env.action_spec(),
        fc_layer_params=(20, 20, 20, 20, 20),
        activation_fn=tf.keras.activations.relu)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

    agent = dqn_agent.DqnAgent(
        env.time_step_spec(),
        env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=tf.Variable(0),
        epsilon_greedy=0.1)

    agent.initialize()
    return agent, q_net
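# Usage sketch (hedged; CartPole is an illustrative assumption): DQN requires a
# discrete action spec, which CartPole provides.
from tf_agents.environments import suite_gym

env = TFPyEnvironment(suite_gym.load("CartPole-v1"))
agent, q_net = generic_dqn_agent(env)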
def step_episode(
    environment: TFPyEnvironment,
    policy: tf_policy.TFPolicy,
    replay_buffer: ReplayBuffer
) -> typing.Tuple[int, int]:
    done = False
    environment.reset()
    curr_episode_rewards = []
    episode_length = 0
    while not done:
        reward, done = step(environment, policy, replay_buffer)
        curr_episode_rewards.append(reward)
        episode_length += 1
    episode_reward = sum(curr_episode_rewards)
    return episode_reward, episode_length
def compute_average_return(env: tf_py_environment.TFPyEnvironment, policy,
                           num_episodes: int = 1) -> float:
    total_return = 0.0
    for _ in range(num_episodes):
        time_step_ = env.reset()
        episode_return = 0.0
        while not any(time_step_.is_last()):
            action_step = policy.action(time_step_)
            time_step_ = env.step(action=action_step.action)
            episode_return += np.mean(time_step_.reward)
        total_return += episode_return
    average_return = total_return / num_episodes
    return average_return