def test_int_key_sparse_to_dense(self):
    # int keys, set_missing_value_to_zero=False
    processor = PythonSparseToDenseProcessor(
        self.sorted_features, set_missing_value_to_zero=False
    )
    value, presence = processor.process(self.int_keyed_sparse_data)
    assert torch.allclose(value, self.expected_value_missing)
    assert torch.all(presence == self.expected_presence_missing)
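# --- Illustrative sketch (not from the original tests) ---
# A minimal, hypothetical usage of PythonSparseToDenseProcessor. The import path
# and the feature ids/values below are assumptions for illustration only;
# process() returns a dense value tensor plus a presence mask, one column per
# sorted feature id, as exercised by the test above.
def _sketch_sparse_to_dense_usage():
    from reagent.preprocessing.sparse_to_dense import (  # assumed import path
        PythonSparseToDenseProcessor,
    )

    processor = PythonSparseToDenseProcessor([101, 102, 103])  # hypothetical ids
    sparse_rows = [{101: 1.0, 103: 2.5}, {102: -0.5}]  # int-keyed sparse rows
    value, presence = processor.process(sparse_rows)
    # One row per input dict, one column per feature id.
    assert value.shape == (2, 3)
    assert presence.shape == (2, 3)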
def __init__(self, model) -> None:
    self.model = model
    self.state_internal_sparse_to_dense = PythonSparseToDenseProcessor(
        self.model.state_sorted_features()
    )
    self.action_internal_sparse_to_dense = PythonSparseToDenseProcessor(
        self.model.action_sorted_features()
    )
    self.softmax_temperature: Optional[float] = None
def test_create_df_from_replay_buffer(self):
    env_name = "MiniGrid-Empty-5x5-v0"
    env = Gym(env_name=env_name)
    state_dim = env.observation_space.shape[0]
    # Wrap env in TestEnv
    env = TestEnv(env)
    problem_domain = ProblemDomain.DISCRETE_ACTION
    DATASET_SIZE = 1000
    multi_steps = None
    DS = "2021-09-16"

    # Generate data
    df = create_df_from_replay_buffer(
        env=env,
        problem_domain=problem_domain,
        desired_size=DATASET_SIZE,
        multi_steps=multi_steps,
        ds=DS,
        shuffle_df=False,
    )
    self.assertEqual(len(df), DATASET_SIZE)

    # Check data
    preprocessor = PythonSparseToDenseProcessor(list(range(state_dim)))
    for idx, row in df.iterrows():
        df_mdp_id = row["mdp_id"]
        env_mdp_id = str(env.sart[idx][0])
        self.assertEqual(df_mdp_id, env_mdp_id)

        df_seq_num = row["sequence_number"]
        env_seq_num = env.sart[idx][1]
        self.assertEqual(df_seq_num, env_seq_num)

        df_state = preprocessor.process([row["state_features"]])[0][0].numpy()
        env_state = env.sart[idx][2]
        npt.assert_array_equal(df_state, env_state)

        df_action = row["action"]
        env_action = str(env.sart[idx][3])
        self.assertEqual(df_action, env_action)

        df_terminal = row["next_action"] == ""
        env_terminal = env.sart[idx][5]
        self.assertEqual(df_terminal, env_terminal)

        if not df_terminal:
            df_reward = float(row["reward"])
            env_reward = float(env.sart[idx][4])
            npt.assert_allclose(df_reward, env_reward)

            df_next_state = preprocessor.process([row["next_state_features"]])[0][
                0
            ].numpy()
            env_next_state = env.sart[idx + 1][2]
            npt.assert_array_equal(df_next_state, env_next_state)

            df_next_action = row["next_action"]
            env_next_action = str(env.sart[idx + 1][3])
            self.assertEqual(df_next_action, env_next_action)
        else:
            del env.sart[idx + 1]
def preprocess_samples(
    self,
    samples: Samples,
    minibatch_size: int,
    use_gpu: bool = False,
    one_hot_action: bool = True,
    normalize_actions: bool = True,
) -> List[TrainingDataPage]:
    logger.info("Shuffling...")
    samples = shuffle_samples(samples)

    logger.info("Sparse2Dense...")
    sorted_state_features, _ = sort_features_by_normalization(self.normalization)
    sorted_action_features, _ = sort_features_by_normalization(
        self.normalization_action
    )
    state_sparse_to_dense_processor = PythonSparseToDenseProcessor(
        sorted_state_features
    )
    action_sparse_to_dense_processor = PythonSparseToDenseProcessor(
        sorted_action_features
    )
    state_matrix, state_matrix_presence = state_sparse_to_dense_processor(
        samples.states
    )
    next_state_matrix, next_state_matrix_presence = state_sparse_to_dense_processor(
        samples.next_states
    )
    action_matrix, action_matrix_presence = action_sparse_to_dense_processor(
        samples.actions
    )
    (
        next_action_matrix,
        next_action_matrix_presence,
    ) = action_sparse_to_dense_processor(samples.next_actions)
    action_probabilities = torch.tensor(
        samples.action_probabilities, dtype=torch.float32
    ).reshape(-1, 1)
    rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1)

    # Pad possible next actions to a fixed max_action_size and track a 0/1 mask
    # of which slots are real actions vs. filler.
    max_action_size = 4
    pnas_mask_list: List[List[int]] = []
    pnas_flat: List[Dict[str, float]] = []
    for pnas in samples.possible_next_actions:
        pnas_mask_list.append([1] * len(pnas) + [0] * (max_action_size - len(pnas)))
        pnas_flat.extend(pnas)
        for _ in range(max_action_size - len(pnas)):
            pnas_flat.append({})  # Filler
    pnas_mask = torch.Tensor(pnas_mask_list)

    (
        possible_next_actions_matrix,
        possible_next_actions_matrix_presence,
    ) = action_sparse_to_dense_processor(pnas_flat)

    logger.info("Preprocessing...")
    state_preprocessor = Preprocessor(self.normalization, False)
    action_preprocessor = Preprocessor(self.normalization_action, False)

    states_ndarray = state_preprocessor(state_matrix, state_matrix_presence)

    if normalize_actions:
        actions_ndarray = action_preprocessor(action_matrix, action_matrix_presence)
    else:
        actions_ndarray = action_matrix

    next_states_ndarray = state_preprocessor(
        next_state_matrix, next_state_matrix_presence
    )
    # Tile each next state max_action_size times so rows line up with the
    # flattened possible-next-action matrix for concatenation below.
    state_pnas_tile = next_states_ndarray.repeat(1, max_action_size).reshape(
        -1, next_states_ndarray.shape[1]
    )

    if normalize_actions:
        next_actions_ndarray = action_preprocessor(
            next_action_matrix, next_action_matrix_presence
        )
    else:
        next_actions_ndarray = next_action_matrix

    if normalize_actions:
        logged_possible_next_actions = action_preprocessor(
            possible_next_actions_matrix, possible_next_actions_matrix_presence
        )
    else:
        logged_possible_next_actions = possible_next_actions_matrix

    assert state_pnas_tile.shape[0] == logged_possible_next_actions.shape[0], (
        "Invalid shapes: "
        + str(state_pnas_tile.shape)
        + " != "
        + str(logged_possible_next_actions.shape)
    )
    logged_possible_next_state_actions = torch.cat(
        (state_pnas_tile, logged_possible_next_actions), dim=1
    )

    logger.info("Reward Timeline to Torch...")
    time_diffs = torch.ones([len(samples.states), 1])

    tdps = []
    pnas_start = 0
    logger.info("Batching...")
    for start in range(0, states_ndarray.shape[0], minibatch_size):
        end = start + minibatch_size
        if end > states_ndarray.shape[0]:
            break
        pnas_end = pnas_start + (minibatch_size * max_action_size)
        tdp = TrainingDataPage(
            states=states_ndarray[start:end],
            actions=actions_ndarray[start:end],
            propensities=action_probabilities[start:end],
            rewards=rewards[start:end],
            next_states=next_states_ndarray[start:end],
            next_actions=next_actions_ndarray[start:end],
            not_terminal=(pnas_mask[start:end, :].sum(dim=1, keepdim=True) > 0),
            time_diffs=time_diffs[start:end],
            possible_next_actions_mask=pnas_mask[start:end, :],
            possible_next_actions_state_concat=logged_possible_next_state_actions[
                pnas_start:pnas_end, :
            ],
        )
        pnas_start = pnas_end
        tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor)
        tdps.append(tdp)
    return tdps
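# --- Illustrative sketch (not part of preprocess_samples) ---
# The repeat/reshape in preprocess_samples tiles every next state
# max_action_size times so each row pairs with one entry of the flattened
# possible-next-action matrix before torch.cat along dim=1. A tiny standalone
# check of that layout, with made-up dimensions:
def _sketch_state_pnas_tiling():
    import torch

    max_action_size = 4  # same constant hard-coded above
    next_states = torch.arange(6, dtype=torch.float32).reshape(2, 3)  # 2 states, dim 3
    tiled = next_states.repeat(1, max_action_size).reshape(-1, next_states.shape[1])
    # Rows 0-3 repeat state 0, rows 4-7 repeat state 1.
    assert tiled.shape == (8, 3)
    assert torch.equal(tiled[0], tiled[3]) and torch.equal(tiled[4], tiled[7])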
def __init__(self, model, action_feature_ids: List[int]) -> None:
    self.model = model
    self.internal_sparse_to_dense = PythonSparseToDenseProcessor(
        self.model.state_sorted_features()
    )
    self.action_feature_ids = action_feature_ids
def preprocess_samples_discrete(
    self,
    samples: Samples,
    minibatch_size: int,
    one_hot_action: bool = True,
    use_gpu: bool = False,
    do_shuffle: bool = True,
) -> List[TrainingDataPage]:
    if do_shuffle:
        logger.info("Shuffling...")
        samples = shuffle_samples(samples)

    logger.info("Preprocessing...")
    sorted_features, _ = sort_features_by_normalization(self.normalization)
    sparse_to_dense_processor = PythonSparseToDenseProcessor(sorted_features)

    state_matrix, state_matrix_presence = sparse_to_dense_processor(samples.states)
    next_state_matrix, next_state_matrix_presence = sparse_to_dense_processor(
        samples.next_states
    )

    logger.info("Converting to Torch...")
    # One-hot encode logged actions against the fixed action list.
    actions_one_hot = torch.tensor(
        (np.array(samples.actions).reshape(-1, 1) == np.array(self.ACTIONS)).astype(
            np.int64
        )
    )
    actions = actions_one_hot.argmax(dim=1, keepdim=True)
    rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1)
    action_probabilities = torch.tensor(
        samples.action_probabilities, dtype=torch.float32
    ).reshape(-1, 1)
    next_actions_one_hot = torch.tensor(
        (
            np.array(samples.next_actions).reshape(-1, 1) == np.array(self.ACTIONS)
        ).astype(np.int64)
    )

    logger.info("Converting PA to Torch...")
    possible_action_strings = np.array(
        list(itertools.zip_longest(*samples.possible_actions, fillvalue=""))
    ).T
    possible_actions_mask = torch.zeros([len(samples.actions), len(self.ACTIONS)])
    for i, action in enumerate(self.ACTIONS):
        possible_actions_mask[:, i] = torch.tensor(
            np.max(possible_action_strings == action, axis=1).astype(np.int64)
        )

    logger.info("Converting PNA to Torch...")
    possible_next_action_strings = np.array(
        list(itertools.zip_longest(*samples.possible_next_actions, fillvalue=""))
    ).T
    possible_next_actions_mask = torch.zeros(
        [len(samples.next_actions), len(self.ACTIONS)]
    )
    for i, action in enumerate(self.ACTIONS):
        possible_next_actions_mask[:, i] = torch.tensor(
            np.max(possible_next_action_strings == action, axis=1).astype(np.int64)
        )

    terminals = torch.tensor(samples.terminals, dtype=torch.int32).reshape(-1, 1)
    not_terminal = 1 - terminals

    logger.info("Converting RT to Torch...")
    time_diffs = torch.ones([len(samples.states), 1])

    logger.info("Preprocessing...")
    preprocessor = Preprocessor(self.normalization, False)

    states_ndarray = preprocessor(state_matrix, state_matrix_presence)
    next_states_ndarray = preprocessor(next_state_matrix, next_state_matrix_presence)

    logger.info("Batching...")
    tdps = []
    for start in range(0, states_ndarray.shape[0], minibatch_size):
        end = start + minibatch_size
        if end > states_ndarray.shape[0]:
            break
        tdp = TrainingDataPage(
            states=states_ndarray[start:end],
            actions=actions_one_hot[start:end]
            if one_hot_action
            else actions[start:end],
            propensities=action_probabilities[start:end],
            rewards=rewards[start:end],
            next_states=next_states_ndarray[start:end],
            # pyre-fixme[16]: `int` has no attribute `__getitem__`.
            not_terminal=not_terminal[start:end],
            next_actions=next_actions_one_hot[start:end],
            possible_actions_mask=possible_actions_mask[start:end],
            possible_next_actions_mask=possible_next_actions_mask[start:end],
            time_diffs=time_diffs[start:end],
        )
        tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor)
        tdps.append(tdp)
    return tdps
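# --- Illustrative sketch (not part of preprocess_samples_discrete) ---
# The one-hot encoding above relies on NumPy broadcasting: reshaping the logged
# action strings to a column and comparing against the full action list yields
# one row per sample with a 1 in the matching column. The ACTIONS names and
# logged values below are made up for illustration.
def _sketch_one_hot_actions():
    import numpy as np
    import torch

    ACTIONS = ["L", "R"]  # hypothetical discrete action names
    logged = ["R", "L", "R"]
    one_hot = torch.tensor(
        (np.array(logged).reshape(-1, 1) == np.array(ACTIONS)).astype(np.int64)
    )
    assert one_hot.tolist() == [[0, 1], [1, 0], [0, 1]]
    # argmax recovers the integer action index used when one_hot_action=False.
    assert one_hot.argmax(dim=1, keepdim=True).tolist() == [[1], [0], [1]]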