def testWithMockAgent_DoneAllFalse(self):
  total_timesteps = 3
  batch_size = 4
  done = np.array([[False, False, False, False],
                   [False, False, False, False],
                   [False, False, False, False]])
  init_state = tf.constant([[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                            [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
                            [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0],
                            [4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]])
  env_output = common.EnvOutput(
      reward=None,
      done=done,
      observation={
          _OBS_KEY_1: tf.ones([total_timesteps, batch_size, 50]),
          _OBS_KEY_0: tf.zeros([total_timesteps, batch_size, 50]),
      },
      info=None)
  agent = MockAgent(total_timesteps, batch_size, init_state, done)
  agent.reset_timestep()
  agent_output, final_state = agent(env_output, init_state)
  np.testing.assert_array_almost_equal(
      np.zeros((total_timesteps, batch_size, 4)), agent_output.policy_logits)
  np.testing.assert_array_almost_equal(
      np.ones((total_timesteps, batch_size)), agent_output.baseline)
  # The mock agent increments the state by 1 at each of the 3 timesteps.
  expected_final_state = np.array([[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0],
                                   [5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0],
                                   [6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0],
                                   [7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0]])
  np.testing.assert_array_almost_equal(expected_final_state,
                                       final_state.numpy())
def setUp(self):
  super(DiscriminatorTest, self).setUp()
  self.data_dir = FLAGS.test_srcdir + 'valan/r2r/testdata'
  self._env_config = hparam.HParams(
      problem='R2R',
      scan_base_dir=self.data_dir,
      data_base_dir=self.data_dir,
      vocab_file='vocab.txt',
      images_per_pano=36,
      max_conns=14,
      image_encoding_dim=2052,
      image_features_dir=os.path.join(self.data_dir, 'image_features'),
      instruction_len=50,
      max_agent_actions=6,
      reward_fn=env_config.RewardFunction.get_reward_fn('distance_to_goal'))
  self._runtime_config = common.RuntimeConfig(task_id=0, num_tasks=100)
  self._env = env.R2REnv(
      data_sources=['R2R_small_split'],
      runtime_config=self._runtime_config,
      env_config=self._env_config)
  self.num_panos = 36
  self.image_feature_size = 2052
  self.num_actions = 14
  self.time_step = 3
  self.batch_size = 1
  done = np.array([[True], [False], [True]])  # Shape = [time, batch].
  self._test_environment = common.EnvOutput(
      reward=0,
      done=done,
      observation={
          constants.IS_START:
              np.array([[True], [False], [True]]),
          constants.DISC_MASK:
              np.array([[True], [False], [True]]),
          constants.PANO_ENC:
              tf.random.normal([
                  self.time_step, self.batch_size, self.num_panos,
                  self.image_feature_size
              ]),
          constants.CONN_ENC:
              tf.random.normal([
                  self.time_step, self.batch_size, self.num_actions,
                  self.image_feature_size
              ]),
          constants.INS_TOKEN_IDS:
              np.array([[[3, 6, 1, 0, 0]], [[3, 6, 1, 0, 0]],
                        [[3, 6, 1, 0, 0]]]),
          constants.VALID_CONN_MASK:
              np.array([[[True] * 14], [[True] * 5 + [False] * 9],
                        [[True] * 2 + [False] * 12]])
      },
      info='')
  self._agent = discriminator_agent.DiscriminatorAgent(
      agent_config.get_r2r_agent_config())
def _reset(self):
  self._frame_count = 0
  # Switch to a new, random region among this worker's regions.
  region_idx = np.random.randint(0, len(self._all_regions))
  self._current_region = self._all_regions[region_idx]
  self._graph = self._all_graphs[self._current_region]
  # Switch to a new, random instruction sequence from this region.
  self._current_sequence_idx = np.random.randint(
      0, len(self._all_entry_sequences[self._current_region]))
  current_entry_sequence = (
      self._all_entry_sequences[self._current_region][
          self._current_sequence_idx])
  # Switch to a new, random instruction from this instruction sequence.
  self._current_entry_idx = np.random.randint(0, len(current_entry_sequence))
  current_entry = current_entry_sequence[self._current_entry_idx]
  # The graph uses different action representations depending on the action
  # space. Convert to integer indices, the unifying representation used here.
  golden_actions, self._golden_path = self._graph.get_golden_actions(
      current_entry)
  self._golden_actions = self._convert_golden_actions(golden_actions)
  self._goal_pano_id = current_entry.route[-1]
  self._graph_state = GraphState(current_entry.route[0],
                                 current_entry.start_heading, 0, 0)
  self._distance_to_goal = self.shortest_path_length(
      self._graph_state.pano_id, self._goal_pano_id)
  if self._run_writer:
    self._run_writer.reset()
    self._run_writer.log_run_data({
        'region': self._current_region,
        'route_id': current_entry.route_id,
        'segment_idx': self._current_entry_idx
    })
  return common.EnvOutput(
      reward=np.float32(0.),
      done=False,
      # We randomly choose prev_action_idx at the beginning of every episode.
      observation=self._get_current_observation(
          prev_action=np.random.choice(
              self._panoramic_action_bins + 1 if self._panoramic_actions
              else streetview_constants.NUM_DISCRETE_ACTIONS)),
      info=self._get_step_info())
def _step(self, action):
  """Updates the state using the provided action.

  Sets `done=True` if either this action corresponds to the stop node or the
  budget for the current episode is exhausted.

  Args:
    action: An integer specifying the next pano id.

  Returns:
    An instance of `EnvOutput`.
  """
  # First check this is a valid action.
  assert action >= 0
  current_observations = self.get_current_env_output().observation
  current_pano_id = current_observations[constants.PANO_ID]
  current_scan_id = current_observations[constants.SCAN_ID]
  current_time_step = current_observations[constants.TIME_STEP]
  assert action in self._scan_info[current_scan_id].conn_ids[current_pano_id]
  next_pano_id = action
  self._path_history.append(next_pano_id)
  done = False
  if (next_pano_id == constants.STOP_NODE_ID or
      current_time_step == self._max_actions_per_episode):
    done = True
  problem_type = self._paths[self._current_idx]['problem_type']
  if problem_type == constants.PROBLEM_VLN:
    reward = np.float32(
        self._compute_vln_reward(
            path_history=self._path_history[:-1],
            next_pano=next_pano_id,
            golden_path=self._paths[self._current_idx]['path'],
            end_of_episode=done,
            scan_info=self._scan_info[current_scan_id]))
  elif problem_type == constants.PROBLEM_NDH:
    reward = np.float32(
        self._compute_ndh_reward(
            path_history=self._path_history[:-1],
            next_pano=next_pano_id,
            golden_path=self._paths[self._current_idx]['path'],
            end_of_episode=done,
            scan_info=self._scan_info[current_scan_id],
            goal_room_panos=self._paths[self._current_idx]['end_panos']))
  else:
    raise ValueError('Invalid problem_type: {}.'.format(problem_type))
  return common.EnvOutput(
      reward=reward,
      done=done,
      observation=self._get_current_observation(next_pano_id, current_scan_id,
                                                current_time_step + 1),
      info='')
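# NOTE: `_compute_vln_reward` is not shown in this excerpt. As a point of
# reference only, below is a minimal sketch of the 'distance_to_goal' shaping
# that the configs in this file select via `RewardFunction.get_reward_fn`.
# The function name and signature are assumptions for illustration, not the
# actual implementation; the real reward also handles the stop action and
# episode end.
def distance_to_goal_reward_sketch(prev_pano, next_pano, goal_pano,
                                   shortest_path_length):
  # Reward is the decrease in graph distance to the goal: positive when the
  # step moves closer to the goal, negative when it moves away.
  prev_potential = shortest_path_length(prev_pano, goal_pano)
  next_potential = shortest_path_length(next_pano, goal_pano)
  return float(prev_potential - next_potential)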
def _step(self, action):
  assert action in self._action_space
  new_state = self._current_state + action
  done = False
  if new_state < 0 or new_state == self._state_space_size:
    new_state = -100  # STOP
    done = True
  self._current_state = new_state
  return common.EnvOutput(
      reward=float(action),
      done=done,
      observation=self._get_current_observation(),
      info='')
def _reset(self):
  """Resets the environment with new data.

  Returns:
    An instance of common.EnvOutput containing the initial observation.
  """
  self._current_idx = self._get_next_idx(self._current_idx)
  current_scan_id = self._paths[self._current_idx]['scan_id']
  current_pano_id = self._scan_info[current_scan_id].pano_name_to_id[
      self._paths[self._current_idx]['path'][0]]
  self._path_history = [current_pano_id]
  return common.EnvOutput(
      reward=np.float32(0.0),
      done=False,
      observation=self._get_current_observation(current_pano_id,
                                                current_scan_id, 0),
      info='')
def _step(self, action):
  """Updates the state using the provided action.

  Sets `done=True` if either this action corresponds to the stop node or the
  budget for the current episode is exhausted.

  Args:
    action: An integer specifying the next pano id.

  Returns:
    An instance of `EnvOutput`.
  """
  # First check this is a valid action.
  assert action >= 0
  current_observations = self.get_current_env_output().observation
  current_pano_id = current_observations[constants.PANO_ID]
  current_scan_id = current_observations[constants.SCAN_ID]
  current_time_step = current_observations[constants.TIME_STEP]
  # Sanity check: panos must be connected when label = 1 or when the label
  # does not exist.
  if not (constants.LABEL in current_observations and
          current_observations[constants.LABEL] == 0):
    if action not in self._scan_info[current_scan_id].conn_ids[
        current_pano_id]:
      raise ValueError('Current and next panos must be connected.')
  next_pano_id = action
  self._path_history.append(next_pano_id)
  done = False
  if (next_pano_id == constants.STOP_NODE_ID or
      current_time_step == self._max_actions_per_episode):
    done = True
  return common.EnvOutput(
      reward=np.float32(
          self._compute_reward(
              path_history=self._path_history[:-1],
              next_pano=next_pano_id,
              golden_path=self._current_path_dict['path'],
              end_of_episode=done,
              scan_info=self._scan_info[current_scan_id])),
      done=done,
      observation=self._get_current_observation(next_pano_id, current_scan_id,
                                                current_time_step + 1),
      info='')
def setUp(self):
  super(AgentR2RTest, self).setUp()
  self.num_panos = 36
  self.image_feature_size = 64 + 256
  self.num_actions = 14
  self.time_step = 3
  self.batch_size = 1
  done = np.array([[True], [False], [True]])  # Shape = [time, batch].
  self._test_environment = common.EnvOutput(
      reward=None,
      done=done,
      observation={
          constants.HEADING:
              np.zeros([self.time_step, self.batch_size, 1]),
          constants.PITCH:
              np.zeros([self.time_step, self.batch_size, 1]),
          constants.PANO_ENC:
              tf.random.normal([
                  self.time_step, self.batch_size, self.num_panos,
                  self.image_feature_size
              ]),
          constants.CONN_ENC:
              tf.random.normal([
                  self.time_step, self.batch_size, self.num_actions,
                  self.image_feature_size
              ]),
          constants.PREV_ACTION_ENC:
              tf.random.normal(
                  [self.time_step, self.batch_size, self.image_feature_size]),
          constants.INS_TOKEN_IDS:
              np.array([
                  [[3, 6, 1, 0, 0]],
                  [[3, 6, 1, 0, 0]],
                  [[3, 6, 1, 0, 0]],
              ]),
          constants.VALID_CONN_MASK:
              np.array([
                  [[1.0] * 14],
                  [[1.0] * 5 + [0.0] * 9],
                  [[1.0] * 2 + [0.0] * 12],
              ])
      },
      info='')
def __init__(self, state_space_size, unroll_length=1):
  self._state_space_size = state_space_size
  # Creates simple dynamics (T stands for transition):
  #   states = [0, 1, ..., state_space_size - 1] + [STOP]
  #   actions = [-1, 1]
  #   T(s, a) = s + a  iff (s + a) is a valid state
  #           = STOP   otherwise
  self._action_space = [-1, 1]
  self._current_state = None
  self._env_spec = common.EnvOutput(
      reward=tf.TensorSpec(shape=[unroll_length + 1], dtype=tf.float32),
      done=tf.TensorSpec(shape=[unroll_length + 1], dtype=tf.bool),
      observation={
          'f1':
              tf.TensorSpec(shape=[unroll_length + 1, 4, 10],
                            dtype=tf.float32),
          'f2':
              tf.TensorSpec(shape=[unroll_length + 1, 7, 10, 2],
                            dtype=tf.float32)
      },
      info=tf.TensorSpec(shape=[unroll_length + 1], dtype=tf.string))
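# Usage sketch for the toy environment above. The class name `MockEnv` is an
# assumption (this excerpt does not show it); `_reset`/`_step` are the
# methods defined in this excerpt. With state_space_size=4, valid states are
# 0..3, and stepping outside that range transitions to STOP (-100) and ends
# the episode.
def _trace_mock_env_sketch():
  env = MockEnv(state_space_size=4)  # Hypothetical class name.
  env._reset()            # state = 0, reward = 0., done = False
  env._step(1)            # state = 1, reward = 1., done = False
  env._step(-1)           # state = 0, reward = -1., done = False
  output = env._step(-1)  # new_state = -1 is invalid -> STOP
  assert output.done and env._current_state == -100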
def setUp(self):
  super(DiscriminatorTest, self).setUp()
  self.data_dir = FLAGS.test_srcdir + 'valan/r2r/testdata'
  self._env_config = hparam.HParams(
      problem='R2R',
      scan_base_dir=self.data_dir,
      data_base_dir=self.data_dir,
      vocab_dir=self.data_dir,
      vocab_file='vocab.txt',
      images_per_pano=36,
      max_conns=14,
      image_encoding_dim=64,
      direction_encoding_dim=256,
      image_features_dir=os.path.join(self.data_dir, 'image_features'),
      instruction_len=50,
      max_agent_actions=6,
      project_decoder_input_states=True,
      use_all_final_states=False,
      reward_fn=env_config.RewardFunction.get_reward_fn('distance_to_goal'))
  self._runtime_config = common.RuntimeConfig(task_id=0, num_tasks=100)
  self._env = env.R2REnv(
      data_sources=['R2R_small_split'],
      runtime_config=self._runtime_config,
      env_config=self._env_config)
  self.num_panos = 36
  self.image_feature_size = 64
  self.direction_encoding_dim = 256
  self.num_actions = 14
  self.time_step = 3
  self.batch_size = 2
  done = np.array([[False, True], [True, False], [True, False]])
  self._test_environment = common.EnvOutput(
      reward=0,
      done=done,
      observation={
          constants.PATH_ID:
              # Shape = [time, batch].
              np.array([[2, 1], [0, 1], [0, 1]]),
          constants.IS_START:
              # Shape = [time, batch].
              np.array([[False, True], [True, False], [False, False]]),
          constants.DISC_MASK:
              # Shape = [time, batch].
              np.array([[False, True], [True, True], [True, True]]),
          constants.PANO_ENC:
              # Shape = [time, batch, num_panos, feature_size].
              tf.random.normal([
                  self.time_step, self.batch_size, self.num_panos,
                  self.image_feature_size + self.direction_encoding_dim
              ]),
          constants.CONN_ENC:
              # Shape = [time, batch, num_actions, feature_size].
              tf.random.normal([
                  self.time_step, self.batch_size, self.num_actions,
                  self.image_feature_size + self.direction_encoding_dim
              ]),
          constants.PREV_ACTION_ENC:
              # Shape = [time, batch, feature_size].
              tf.random.normal([
                  self.time_step, self.batch_size,
                  self.image_feature_size + self.direction_encoding_dim
              ]),
          constants.NEXT_GOLDEN_ACTION_ENC:
              # Shape = [time, batch, feature_size].
              tf.random.normal([
                  self.time_step, self.batch_size,
                  self.image_feature_size + self.direction_encoding_dim
              ]),
          constants.INS_TOKEN_IDS:
              # Shape = [time, batch, token_len].
              np.array([[[5, 3, 2, 1, 0], [3, 4, 5, 6, 1]],
                        [[3, 6, 1, 0, 0], [3, 4, 5, 6, 1]],
                        [[3, 6, 1, 0, 0], [3, 4, 5, 6, 1]]]),
          constants.INS_LEN:
              # Shape = [time, batch].
              np.tile(np.array([[3]]), [self.time_step, self.batch_size]),
          constants.VALID_CONN_MASK:
              # Shape = [time, batch, num_connections].
              np.tile(
                  np.array([[[True] * 14], [[True] * 5 + [False] * 9],
                            [[True] * 2 + [False] * 12]]),
                  [1, self.batch_size, 1]),
          constants.LABEL:
              # Shape = [time, batch].
              np.array([[False, False], [True, False], [True, False]])
      },
      info='')
  self._agent_config = agent_config.get_r2r_agent_config()
def setUp(self):
  super(EvalMetricTest, self).setUp()
  self.data_dir = FLAGS.test_srcdir + 'valan/r2r/testdata'
  self._env_config = hparam.HParams(
      problem='R2R',
      base_path=self.data_dir,
      vocab_file='vocab.txt',
      images_per_pano=36,
      max_conns=14,
      image_encoding_dim=2052,
      image_features_dir=os.path.join(self.data_dir, 'image_features'),
      instruction_len=50,
      max_agent_actions=6,
      reward_fn=env_config.RewardFunction.get_reward_fn('distance_to_goal'))
  self._runtime_config = common.RuntimeConfig(task_id=0, num_tasks=1)
  self._env = env.R2REnv(
      data_sources=['small_split'],
      runtime_config=self._runtime_config,
      env_config=self._env_config)

  # Scan: gZ6f7yhEvPG. The executed path below is 1, 3, 7, 5, 2.
  self._golden_path = [1, 4, 6, 2]
  self._scan_id = 0  # testdata has a single scan only: 'gZ6f7yhEvPG'.

  def _env_output(reward, done, pano_id):
    # Every step shares the same golden path, goal and scan; only the
    # reward, done flag and current pano vary.
    return common.EnvOutput(
        reward=reward,
        done=done,
        observation={
            constants.PANO_ID: pano_id,
            constants.GOLDEN_PATH: self._golden_path,
            constants.GOAL_PANO_ID: 2,
            constants.SCAN_ID: self._scan_id,
            constants.GOAL_ROOM_PANOS: [6, 2, constants.INVALID_NODE_ID]
        },
        info=None)

  self._env_list = [
      _env_output(reward=0, done=None, pano_id=1),
      _env_output(reward=1, done=None, pano_id=3),
      _env_output(reward=1, done=None, pano_id=7),
      _env_output(reward=1, done=None, pano_id=5),
      _env_output(reward=1, done=False, pano_id=2),
      # Final step: reward=4 (success) and done=True (end of episode); the
      # observation already belongs to the next episode.
      _env_output(reward=4, done=True, pano_id=11),
  ]
  self._action_list = [3, 7, 5, 2, 0]
def __init__(self):
  self._current_env_output = common.EnvOutput(
      reward=None, done=True, observation=None, info=None)
def _step(self, action):
  """Steps the environment.

  Args:
    action: If not using panoramic actions, this is an integer index in
      [0, 3]. If using panoramic actions, this is an integer index in
      [0, K], where values 0 to K-1 are angle bins from -180 to 180 degrees
      and K is the stop action.

  Returns:
    The next environment output.
  """
  self._frame_count += 1
  # Update distance to goal. Note that at every step, self._distance_to_goal
  # is the distance to goal from the pano we reached in the previous step.
  self._distance_to_goal = self.shortest_path_length(
      self._graph_state.pano_id, self._goal_pano_id)
  if not isinstance(action, np.ndarray):
    action = np.array(action, dtype=np.int64)
  # Convert the action index to the representation used by the graph.
  if self._panoramic_actions:
    # go_towards is 'stop' or a heading angle in degrees.
    go_towards = ('stop' if action >= self._panoramic_action_bins else
                  self._pano_action_bin_to_heading(action))
  else:
    # go_towards is left, right, forward or stop.
    go_towards = self.ACTION_IDX_TO_STR[int(action)]
  reward = 0.
  done = False
  if go_towards == 'stop':
    # If the action is to stop.
    reward = 1. if self._goal_pano_id == self._graph_state.pano_id else -1.
    done = True
  else:
    # Else, take the step.
    next_graph_state = self._graph.get_next_graph_state(
        self._graph_state, go_towards)
    if len(self._graph.nodes[next_graph_state.pano_id].neighbors) < 2:
      # Stay still when running into the boundary of the graph.
      logging.info('At the border (number of neighbors < 2). Did not go %s.',
                   str(go_towards))
      done = True
    else:
      prev_state_potential = self.shortest_path_length(
          self._graph_state.pano_id, self._goal_pano_id)
      cur_state_potential = self.shortest_path_length(
          next_graph_state.pano_id, self._goal_pano_id)
      dist_reward = prev_state_potential - cur_state_potential
      reward = self._DEFAULT_STEP_REWARD + dist_reward
      self._graph_state = next_graph_state
  if self._frame_count > self._max_actions_per_episode:
    done = True
  observation = self._get_current_observation(prev_action=action)
  # Log the transition.
  if self._run_writer:
    self._run_writer.log_action(action, go_towards)
    self._run_writer.log_state(self._graph_state)
    self._run_writer.log_observation(observation)
    if done:
      self._run_writer.write()
  return common.EnvOutput(
      reward=reward,
      done=done,
      observation=observation,
      info=self._get_step_info())
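# `_pano_action_bin_to_heading` is not shown in this excerpt. Below is a
# minimal sketch of one plausible bin-to-heading mapping, assuming `num_bins`
# evenly sized bins spanning [-180, 180) degrees and returning the center of
# the chosen bin. The name, signature, and convention are assumptions; the
# actual environment may map bins differently.
def pano_action_bin_to_heading_sketch(action_bin, num_bins):
  bin_width = 360.0 / num_bins
  # E.g. with num_bins=8: bin 0 -> -157.5 degrees, bin 4 -> 22.5 degrees.
  return -180.0 + (action_bin + 0.5) * bin_width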
def _reset(self):
  self._current_state = 0  # Always start at state 0.
  return common.EnvOutput(
      reward=0.,
      done=False,
      observation=self._get_current_observation(),
      info='')