def _setup_specs(self):
    self._policy_step_spec = policy_step.PolicyStep(
        action=self._action_spec,
        state=self._policy_state_spec,
        info=self._info_spec)
    self._trajectory_spec = trajectory.from_transition(
        self._time_step_spec, self._policy_step_spec, self._time_step_spec)
def update(self, observation, reward, is_terminal, action_step):
    time_step = self.env.current_time_step()
    self.train_py_env.set_next(observation, reward, is_terminal)
    next_time_step = self.env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    self.replay_buffer.add_batch(traj)
    self.update_network(traj)
def collect_step(environment, policy, replay_buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    # batch = tf.nest.map_structure(lambda t: tf.expand_dims(t, 0), traj)
    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)
def collect(self, env, policy):
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    self.buffer.add_batch(traj)
    return traj
def collect_step(env, time_step, py_policy, replay_buffer):
    """Steps the environment and collects experience into the replay buffer."""
    action_step = py_policy.action(time_step)
    next_time_step = env.step(action_step.action)
    if not time_step.is_last():
        traj = trajectory.from_transition(time_step, action_step, next_time_step)
        replay_buffer.add_batch(traj)
    return next_time_step
def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    # Add trajectory to the replay buffer
    buffer.add_batch(traj)
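# A minimal usage sketch for the collect_step variants above, assuming
# TF-Agents with a Gym CartPole environment and a random collect policy;
# the environment name, buffer length, and step count are illustrative,
# not part of the original snippets.
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer

env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(), env.action_spec())
# policy.trajectory_spec matches what trajectory.from_transition produces,
# so the buffer can store the trajectories emitted by collect_step.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=policy.trajectory_spec,
    batch_size=env.batch_size,
    max_length=1000)

env.reset()
for _ in range(100):
    collect_step(env, policy, replay_buffer)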
def timestamp_data(self, environment, policy):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    timestamp_trajectory = trajectory.from_transition(
        time_step, action_step, next_time_step)
    self._replay_buffer.add_batch(timestamp_trajectory)
def collect_data(env, policy, buffer, steps):
    for _ in range(steps):
        time_step = env.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = env.step(action_step.action)
        traject = trajectory.from_transition(time_step, action_step, next_time_step)
        buffer.add_batch(traject)
def collect_step(self, env: tf_py_environment.TFPyEnvironment,
                 policy: tf_policy.Base,
                 replay_buffer: TFUniformReplayBuffer):
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
def predict_and_fine_tune(self):
    prev_step = self.time_step
    action_step = q_agent.agent.policy.action(self.time_step)
    self.time_step = fine_tune_env.step(action_step.action)
    traj = trajectory.from_transition(prev_step, action_step, self.time_step)
    replay_buffer.add_batch(traj)
    experience, _ = next(iterator)
    train_loss = q_agent.agent.train(experience)
    return action_step.action
def collect_step(environment, policySteps, buffer, alg):
    time_step, total_agents_action, next_time_step = one_step(
        environment, policySteps, alg)
    traj = trajectory.from_transition(time_step, total_agents_action,
                                      next_time_step, alg=alg,
                                      joint_action=True)
    buffer.add_batch(traj)
def collect_step(environment, policy, buffer, drone):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    options.drones_locations[drone] = next_time_step.observation
    traj = trajectory.from_transition(
        time_step, action_step, next_time_step)
    buffer.add_batch(traj)
def collect_step(self):
    time_step = self.env.current_time_step()
    action_step = self.agent.policy.action(time_step)
    next_time_step = self.env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    # Add trajectory to the replay buffer
    self.replay_buffer.add_batch(traj)
def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    # Add trajectory to the replay buffer, with extra copies per 0.001 reward
    # (jury-rigged experience prioritisation weighting)
    for i in range(int((traj.reward // 0.001) + 1)):
        buffer.add_batch(traj)
def step(
    environment: TFPyEnvironment,
    policy: tf_policy.TFPolicy,
    replay_buffer: ReplayBuffer
) -> typing.Tuple[float, bool]:
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
    return next_time_step.reward.numpy()[0], next_time_step.is_last()
def run_agent(self, policy, recorder, env_name, py_env, tf_env,
              encoded_images=None):
    """Run an agent's policy in a particular environment. Possibly record."""
    if self.save_matrices and encoded_images is None:
        encoded_images = []

    # Add blank frames to make it easier to distinguish between runs/agents
    for _ in range(self.num_blank_frames):
        if recorder:
            recorder.add_frame(self.blank_frame)
        if self.save_matrices:
            encoded_images.append(self.blank_frame_encoding)

    rewards = 0
    policy_state = policy.get_initial_state(1)

    if 'domain_randomization' in self.name and env_name == self.adv_env_name:
        time_step = tf_env.reset_random()
    elif 'Adversarial' in env_name:
        time_step = tf_env.reset_agent()
    else:
        time_step = tf_env.reset()

    if recorder:
        recorder.add_frame(py_env.render())
    if self.save_matrices:
        encoded_images.append(self.py_env._gym_env.grid.encode())  # pylint:disable=protected-access

    num_steps = tf.constant(0.0)
    while True:
        policy_step = policy.action(time_step, policy_state=policy_state)
        policy_state = policy_step.state
        next_time_step = tf_env.step(policy_step.action)
        traj = trajectory.from_transition(time_step, policy_step, next_time_step)
        time_step = next_time_step

        num_steps += tf.math.reduce_sum(
            tf.cast(~traj.is_boundary(), tf.float32))

        rewards += time_step.reward
        if recorder:
            recorder.add_frame(py_env.render())
        if self.save_matrices:
            encoded_images.append(self.py_env._gym_env.grid.encode())  # pylint:disable=protected-access

        if traj.is_last():
            break

    return rewards.numpy().sum(), encoded_images
def loop_body(counter, time_step, policy_state):
    """Runs a step in environment. While loop will call multiple times.

    Args:
      counter: Episode counters per batch index. Shape [batch_size].
      time_step: TimeStep tuple with elements shape [batch_size, ...].
      policy_state: Policy state tensor shape [batch_size, policy_state_dim].
        Pass empty tuple for non-recurrent policies.

    Returns:
      loop_vars for next iteration of tf.while_loop.
    """
    action_step = self.policy.action(time_step, policy_state)

    # TODO(b/134487572): TF2 while_loop seems to either ignore
    # parallel_iterations or doesn't properly propagate control dependencies
    # from one step to the next. Without this dep, self.env.step() is called
    # in parallel.
    with tf.control_dependencies(tf.nest.flatten([time_step])):
        next_time_step = self.env.step(action_step.action)

    policy_state = action_step.state

    if self._is_bandit_env:
        # For Bandits we create episodes of length 1.
        # Since the `next_time_step` is always of type LAST we need to replace
        # the step type of the current `time_step` to FIRST.
        batch_size = tf.shape(input=time_step.discount)
        time_step = time_step._replace(
            step_type=tf.fill(batch_size, ts.StepType.FIRST))

    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    #### Save trajectory into the server's table ####
    print('Trajectory: {}'.format(traj))
    self.sampling_client.insert(traj, {'my_table': 1.0})
    ################################################

    observer_ops = [observer(traj) for observer in self._observers]
    transition_observer_ops = [
        observer((time_step, action_step, next_time_step))
        for observer in self._transition_observers
    ]
    with tf.control_dependencies(
        [tf.group(observer_ops + transition_observer_ops)]):
        time_step, next_time_step, policy_state = tf.nest.map_structure(
            tf.identity, (time_step, next_time_step, policy_state))

    # The while loop counter is only incremented when an episode resets.
    # For Bandits, this is every trajectory; for MDPs, this is at boundaries.
    if self._is_bandit_env:
        counter += tf.ones(batch_size, dtype=tf.int32)
    else:
        counter += tf.cast(traj.is_boundary(), dtype=tf.int32)

    return [counter, next_time_step, policy_state]
def sample(self, batch_size):
    dummy_action_step = policy_step.PolicyStep(
        action=tf.constant([tf.int32.min]))
    dummy_time_step = ts.TimeStep(step_type=tf.constant([tf.int32.min]),
                                  reward=(np.nan * tf.ones(1)),
                                  discount=(np.nan * tf.ones(1)),
                                  observation=None)
    trajs = []
    for transition in random.sample(self.buffer, batch_size):
        traj1 = trajectory.from_transition(transition.time_step,
                                           transition.action_step,
                                           transition.next_time_step)
        traj2 = trajectory.from_transition(transition.next_time_step,
                                           dummy_action_step,
                                           dummy_time_step)
        trajs.append(
            nest_utils.unbatch_nested_tensors(
                nest_utils.stack_nested_tensors([traj1, traj2], axis=1)))
    return nest_utils.stack_nested_tensors(trajs)
def _data_spec(self):
    return trajectory.from_transition(
        self.time_step_spec,
        policy_step.PolicyStep(
            action=self.action_spec,
            state=self.policy_state_spec,
            info=()),
        self.time_step_spec,
    )
def collect_step(self, env, policy, buffer, mac):
    """Collects the current time step of the environment and maps it to an
    action via the Q-table.
    """
    if self.initial_step[mac]:
        self.initial_step[mac] = False
        time_step = env.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = env.step(action_step.action)
        traj = trajectory.from_transition(time_step, action_step, next_time_step)
        buffer.add_batch(traj)

    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    buffer.add_batch(traj)
def collect_step(environment, policy, buffer):
    """Returns reward and termination."""
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    # Add trajectory to the replay buffer
    buffer.add_batch(traj)
    return next_time_step.reward[0], next_time_step.is_last()
def collect_step(env: tf_py_environment.TFPyEnvironment, policy, buffer):
    time_step_ = env.current_time_step()
    action_step = policy.action(time_step_)
    next_time_step = env.step(action=action_step.action)
    traj = trajectory.from_transition(time_step=time_step_,
                                      action_step=action_step,
                                      next_time_step=next_time_step)
    # add trajectory to the replay buffer
    buffer.add_batch(traj)
def loop_body(counter, time_step, policy_state):
    """Runs a step in environment. While loop will call multiple times.

    Args:
      counter: Episode counters per batch index. Shape [batch_size].
      time_step: TimeStep tuple with elements shape [batch_size, ...].
      policy_state: Policy state tensor shape [batch_size, policy_state_dim].
        Pass empty tuple for non-recurrent policies.

    Returns:
      loop_vars for next iteration of tf.while_loop.
    """
    action_step = self.policy.action(time_step, policy_state)

    # TODO(b/134487572): TF2 while_loop seems to either ignore
    # parallel_iterations or doesn't properly propagate control dependencies
    # from one step to the next. Without this dep, self.env.step() is called
    # in parallel.
    with tf.control_dependencies(tf.nest.flatten([time_step])):
        if isinstance(
            self.policy,
            (discrete_boltzmann_policy.DiscreteBoltzmannPolicy,
             greedy_policy.GreedyPolicy,
             epsilon_discrete_boltzmann_policy.EpsilonDiscreteBoltzmannPolicy)):
            next_time_step = self.env.step(
                tf.cast(action_step.action[:, 0], tf.int64))
        else:
            next_time_step = self.env.step(action_step.action)

    policy_state = action_step.state

    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    observer_ops = [observer(traj) for observer in self._observers]
    transition_observer_ops = [
        observer((time_step, action_step, next_time_step))
        for observer in self._transition_observers
    ]
    with tf.control_dependencies(
        [tf.group(observer_ops + transition_observer_ops)]):
        time_step, next_time_step, policy_state = tf.nest.map_structure(
            tf.identity, (time_step, next_time_step, policy_state))

    # The while loop counter is only incremented at episode boundaries.
    counter += tf.cast(traj.is_boundary(), dtype=tf.int32)
    return [counter, next_time_step, policy_state]
def get_tf_buffers(c, max_length=270):
    obs_spec, ac_spec = get_env_specs(c)
    time_step_spec = ts.time_step_spec(obs_spec)
    action_spec = policy_step.PolicyStep(ac_spec)
    trajectory_spec = trajectory.from_transition(
        time_step_spec, action_spec, time_step_spec)
    the_replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=trajectory_spec, batch_size=1, max_length=max_length)
    return the_replay_buffer
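# Hedged follow-up for get_tf_buffers: once steps have been collected into the
# returned buffer, training code typically reads it back as a tf.data.Dataset
# of adjacent step pairs. The batch size and the DQN-style `agent` below are
# illustrative assumptions, not part of the original snippet.
replay_buffer = get_tf_buffers(c)
dataset = replay_buffer.as_dataset(
    sample_batch_size=64,
    num_steps=2)  # (t, t+1) pairs, as DQN-style agents expect
iterator = iter(dataset)
experience, _ = next(iterator)
# loss_info = agent.train(experience)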
def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    action_num = action_step.action.numpy()[0]
    # print(env.map_action_to_coordinate(action_num - 3136 * 4))
    # print(floor(action_num / 3136))
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    # Add trajectory to the replay buffer
    buffer.add_batch(traj)
def collect_step(environment, policy):
    """Executes one action on the current game state in `environment`
    (chosen by `policy`) and stores the result as a trajectory in
    `replay_buffer`.
    """
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    # Add trajectory to the replay buffer
    replay_buffer.add_batch(traj)
def _make_replay_buffer(self, tf_env):
    """Default replay buffer factory."""
    time_step_spec = tf_env.time_step_spec()
    action_spec = tf_env.action_spec()
    action_step_spec = policy_step.PolicyStep(
        action_spec, (), tensor_spec.TensorSpec((), tf.int32))
    trajectory_spec = trajectory.from_transition(time_step_spec,
                                                 action_step_spec,
                                                 time_step_spec)
    return episodic_replay_buffer.EpisodicReplayBuffer(
        trajectory_spec, end_episode_fn=lambda _: False)
def add_to_replay_buffer(transition):
    time_step, _, next_time_step = transition
    time_step = prepare(time_step)
    next_time_step = prepare(next_time_step)
    action_step = policy.action(time_step)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    traj_batched = tf.nest.map_structure(
        lambda t: tf.stack([t] * batch_size), traj)
    replay_buffer.add_batch(traj_batched)
def collect_step(environment, policy, buffer):
    """Execute one step in the environment and add it to the buffer.

    Args:
        environment (object): Environment of the game.
        policy (object): Policy network that selects the action.
        buffer (object): Replay buffer storing data for training.
    """
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    buffer.add_batch(traj)
def collect_data(environment, policy, num_episodes):
    episode_counter = 0
    environment.reset()
    while episode_counter < num_episodes:
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        traj = trajectory.from_transition(time_step, action_step, next_time_step)
        # Add trajectory to the replay buffer
        replay_buffer.add_batch(traj)
        if traj.is_boundary():
            episode_counter += 1
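# Illustrative driver for the episode-based collect_data above, assuming
# `train_env`, `agent`, the module-level `replay_buffer` it writes into, and a
# dataset `iterator` over that buffer already exist; the iteration and episode
# counts are placeholders, not values from the original code.
for _ in range(num_iterations):
    collect_data(train_env, agent.collect_policy, num_episodes=1)
    experience, _ = next(iterator)
    train_loss = agent.train(experience).loss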