Example 1
 def handle(self, tdp: TrainingDataPage) -> None:
     if not self.trainer.calc_cpe_in_training:
         return
     if isinstance(tdp, TrainingDataPage):
         if isinstance(self.trainer, DQNTrainer):
             # This is required until we get rid of TrainingDataPage
             if self.trainer.maxq_learning:
                 edp = EvaluationDataPage.create_from_training_batch(
                     tdp.as_discrete_maxq_training_batch(), self.trainer)
             else:
                 edp = EvaluationDataPage.create_from_training_batch(
                     tdp.as_discrete_sarsa_training_batch(), self.trainer)
         else:
             edp = EvaluationDataPage.create_from_tdp(tdp, self.trainer)
     elif isinstance(tdp, TrainingBatch):
         if isinstance(self.trainer, SACTrainer):
             # TODO: Implement CPE for continuous algos
             edp = None
         else:
             edp = EvaluationDataPage.create_from_training_batch(
                 tdp, self.trainer)
     # edp is None for trainers without CPE support (SACTrainer above); skip
     # accumulation in that case instead of appending None.
     if edp is None:
         return
     if self.evaluation_data is None:
         self.evaluation_data = edp
     else:
         self.evaluation_data = self.evaluation_data.append(edp)
Example 2
 def train_numpy(
     self,
     tdp: TrainingDataPage,
     evaluator: Optional[Evaluator],
 ):
     tdp.states = self._reshape_states(tdp.states)
     tdp.next_states = self._reshape_states(tdp.next_states)
      # Delegate to the parent implementation; calling self.train_numpy here
      # (as the original line did) would recurse forever.
      super().train_numpy(tdp, evaluator)
Example 3
    def preprocess(self, batch) -> TrainingDataPage:
        tdp = super().preprocess(batch)

        sorted_action_features, _ = (
            self.action_preprocessor._sort_features_by_normalization())
        sorted_action_features_str = [str(x) for x in sorted_action_features]
        actions = self.sparse_to_dense_processor(sorted_action_features_str,
                                                 batch["action"])

        not_terminal = torch.from_numpy(
            np.array(batch["next_action"],
                     dtype=bool).astype(np.float32)).reshape(-1, 1)
        pnas_mask, possible_next_state_actions = None, None
        pas_mask, possible_state_actions = None, None
        next_actions = None

        return TrainingDataPage(
            mdp_ids=tdp.mdp_ids,
            sequence_numbers=tdp.sequence_numbers,
            states=tdp.states,
            actions=actions,
            propensities=tdp.propensities,
            rewards=tdp.rewards,
            possible_actions_mask=pas_mask,
            next_states=tdp.next_states,
            next_actions=next_actions,
            possible_next_actions_mask=pnas_mask,
            not_terminal=not_terminal,
            time_diffs=tdp.time_diffs,
            possible_actions_state_concat=possible_state_actions,
            possible_next_actions_state_concat=possible_next_state_actions,
        )
Example 4
    def preprocess(self, batch) -> TrainingDataPage:
        tdp = super().preprocess(batch)
        actions = self.read_actions(batch["action"])
        pas_mask = torch.from_numpy(
            np.array(batch["possible_actions"], dtype=np.float32))

        next_actions = self.read_actions(batch["next_action"])
        pnas_mask = np.array(batch["possible_next_actions"], dtype=np.float32)
        not_terminal = torch.from_numpy(
            np.max(pnas_mask, 1).astype(np.float32).reshape(-1, 1)).float()
        pnas_mask = torch.from_numpy(pnas_mask)

        possible_next_state_actions = None
        possible_state_actions = None

        return TrainingDataPage(
            mdp_ids=tdp.mdp_ids,
            sequence_numbers=tdp.sequence_numbers,
            states=tdp.states,
            actions=actions,
            propensities=tdp.propensities,
            rewards=tdp.rewards,
            possible_actions_mask=pas_mask,
            next_states=tdp.next_states,
            next_actions=next_actions,
            possible_next_actions_mask=pnas_mask,
            not_terminal=not_terminal,
            time_diffs=tdp.time_diffs,
            possible_actions_state_concat=possible_state_actions,
            possible_next_actions_state_concat=possible_next_state_actions,
        )
Example 5
def preprocess_batch_for_training(action_names, batch, state_normalization):
    sorted_features, _ = preprocessor_net.sort_features_by_normalization(
        state_normalization)
    sorted_features_str = [str(x) for x in sorted_features]

    state_features_df = pd.DataFrame(batch["state_features"])
    state_features_dense = state_features_df[sorted_features_str].values
    next_state_features_df = pd.DataFrame(batch["next_state_features"])
    next_state_features_dense = next_state_features_df[
        sorted_features_str].values
    actions = read_actions(action_names, batch["action"])
    pnas = np.array(batch["possible_next_actions"], dtype=np.float32)
    rewards = np.array(batch["reward"], dtype=np.float32)
    time_diffs = np.array(batch["time_diff"], dtype=np.int32)
    not_terminals = np.max(pnas, 1).astype(bool)
    episode_values = np.array(batch["episode_value"], dtype=np.float32)

    # Add preprocessing steps in PyTorch here

    return TrainingDataPage(
        states=state_features_dense,
        actions=actions,
        rewards=rewards,
        next_states=next_state_features_dense,
        possible_next_actions=pnas,
        episode_values=episode_values,
        not_terminals=not_terminals,
        time_diffs=time_diffs,
    )
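The DataFrame lookup above is what turns the sparse {feature_id: value} dicts into a dense matrix. A tiny self-contained illustration of that one step; the rows and feature ids here are made up for demonstration and are not from the library:

import numpy as np
import pandas as pd

# Hypothetical sparse rows, shaped like batch["state_features"].
rows = [{"101": 0.5, "102": 1.0}, {"102": 2.0}]
sorted_features_str = ["101", "102"]  # order produced by the normalization sort

# Column selection densifies the rows; features missing from a row become NaN.
dense = pd.DataFrame(rows)[sorted_features_str].values
print(dense.shape)  # (2, 2)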
Example 6
    def preprocess_samples_discrete(
        self,
        states: List[np.ndarray],
        actions: List[np.ndarray],
        rewards: List[int],
        next_states: List[np.ndarray],
        next_actions: List[np.ndarray],
        terminals: List[bool],
        possible_next_actions: List[np.ndarray],
        minibatch_size: int,
    ) -> List[TrainingDataPage]:
        # Shuffle
        merged = list(
            zip(
                states,
                actions,
                rewards,
                next_states,
                next_actions,
                terminals,
                possible_next_actions,
            ))
        self.np_random.shuffle(merged)
        states, actions, rewards, next_states, next_actions, terminals, possible_next_actions = zip(
            *merged)

        not_terminals = np.logical_not(terminals).reshape(-1, 1)
        time_diffs = torch.ones([len(states), 1], dtype=torch.float32)

        tdps = []
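        # Walk the shuffled transitions in fixed-size windows; a trailing
        # partial window (fewer than minibatch_size rows) is dropped.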
        for start in range(0, len(states), minibatch_size):
            end = start + minibatch_size
            if end > len(states):
                break
            tdps.append(
                TrainingDataPage(
                    states=torch.tensor(states[start:end],
                                        dtype=torch.float32),
                    actions=torch.tensor(actions[start:end],
                                         dtype=torch.float32),
                    propensities=torch.ones([end - start, 1],
                                            dtype=torch.float32),
                    rewards=torch.tensor(rewards[start:end],
                                         dtype=torch.float32).reshape(-1, 1),
                    next_states=torch.tensor(next_states[start:end],
                                             dtype=torch.float32),
                    next_actions=torch.tensor(next_actions[start:end],
                                              dtype=torch.float32),
                    possible_next_actions=torch.tensor(
                        possible_next_actions[start:end], dtype=torch.float32),
                    not_terminals=torch.tensor(not_terminals[start:end].astype(
                        np.float32),
                                               dtype=torch.float32),
                    time_diffs=time_diffs[start:end],
                ))
            tdps[-1].set_type(torch.FloatTensor)
        return tdps
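The fixed-size slicing loop above (and in several later examples) is independent of TrainingDataPage itself. A minimal, library-free sketch of the same pattern, with a hypothetical helper name:

def slice_minibatches(columns, minibatch_size):
    """Split equally sized sequences into fixed-size minibatches, dropping any trailing partial batch."""
    n = len(columns[0])
    batches = []
    for start in range(0, n, minibatch_size):
        end = start + minibatch_size
        if end > n:
            break
        batches.append([c[start:end] for c in columns])
    return batches

# e.g. slice_minibatches([states, actions, rewards], 1024)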
Example 7
    def sample_memories(self, batch_size, model_type):
        """
        Samples transitions from replay memory uniformly at random.

        :param batch_size: Number of sampled transitions to return.
        :param model_type: Model type (discrete, parametric).
        """
        cols = [[], [], [], [], [], [], [], [], []]
        indices = np.random.permutation(len(self.replay_memory))[:batch_size]
        for idx in indices:
            memory = self.replay_memory[idx]
            for col, value in zip(cols, memory):
                col.append(value)

        possible_next_actions_lengths = torch.tensor(cols[7],
                                                     dtype=torch.int32)
        next_states = torch.tensor(cols[3], dtype=torch.float32)

        if model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
            possible_next_actions = []
            for pna_matrix in cols[6]:
                for row in pna_matrix:
                    possible_next_actions.append(row)

            tiled_states = torch.from_numpy(
                np.repeat(next_states.numpy(),
                          possible_next_actions_lengths.numpy(),
                          axis=0))
            possible_next_actions = torch.tensor(possible_next_actions,
                                                 dtype=torch.float32)
            possible_next_actions_state_concat = torch.cat(
                (tiled_states, possible_next_actions), dim=1)
        else:
            if cols[6] is None or cols[6][0] is None:
                possible_next_actions = None
            else:
                possible_next_actions = torch.tensor(cols[6],
                                                     dtype=torch.float32)
            possible_next_actions_state_concat = None

        return TrainingDataPage(
            states=torch.tensor(cols[0], dtype=torch.float32),
            actions=torch.tensor(cols[1], dtype=torch.float32),
            propensities=None,
            rewards=torch.tensor(cols[2], dtype=torch.float32).reshape(-1, 1),
            next_states=torch.tensor(cols[3], dtype=torch.float32),
            next_actions=torch.tensor(cols[4], dtype=torch.float32),
            possible_next_actions=possible_next_actions,
            episode_values=None,
            not_terminals=torch.from_numpy(
                np.logical_not(np.array(cols[5]),
                               dtype=bool).astype(np.int32)).reshape(-1, 1),
            time_diffs=torch.tensor(cols[8], dtype=torch.int32).reshape(-1, 1),
            possible_next_actions_lengths=possible_next_actions_lengths,
            possible_next_actions_state_concat=
            possible_next_actions_state_concat,
        )
Example 8
    def preprocess_samples_discrete(
        self,
        states: List[np.ndarray],
        actions: List[np.ndarray],
        rewards: List[int],
        next_states: List[np.ndarray],
        next_actions: List[np.ndarray],
        is_terminals: List[bool],
        possible_next_actions: List[np.ndarray],
        episode_values: List[float],
        minibatch_size: int,
    ) -> List[TrainingDataPage]:
        # Shuffle
        merged = list(
            zip(
                states,
                actions,
                rewards,
                next_states,
                next_actions,
                is_terminals,
                possible_next_actions,
                episode_values,
            ))
        self.np_random.shuffle(merged)
        states, actions, rewards, next_states, next_actions, is_terminals, possible_next_actions, episode_values = zip(
            *merged)

        not_terminals = np.logical_not(is_terminals).reshape(-1, 1)

        tdps = []
        for start in range(0, len(states), minibatch_size):
            end = start + minibatch_size
            if end > len(states):
                break
            tdps.append(
                TrainingDataPage(
                    states=np.array(states[start:end], dtype=np.float32),
                    actions=np.array(actions[start:end], dtype=np.float32),
                    propensities=np.ones([end - start, 1]),
                    rewards=np.array(rewards[start:end],
                                     dtype=np.float32).reshape(-1, 1),
                    next_states=np.array(next_states[start:end],
                                         dtype=np.float32),
                    next_actions=np.array(next_actions[start:end],
                                          dtype=np.float32),
                    possible_next_actions=np.array(
                        possible_next_actions[start:end], dtype=np.float32),
                    episode_values=np.array(episode_values[start:end],
                                            dtype=np.float32).reshape(-1, 1),
                    not_terminals=not_terminals[start:end],
                ))
        return tdps
Example 9
    def get_training_data_page(self, num_samples):
        """
        Returns a TrainingDataPage with shuffled, transformed transitions from
        replay memory.

        :param num_samples: Number of transitions to sample from replay memory.
        """
        states, actions, rewards, next_states, next_actions, terminals,\
            possible_next_actions = self.sample_memories(num_samples)
        return TrainingDataPage(
            np.array(states, dtype=np.float32),
            np.array(actions, dtype=np.float32),
            np.array(rewards, dtype=np.float32),
            np.array(next_states, dtype=np.float32),
            np.array(next_actions, dtype=np.float32),
            np.array(possible_next_actions, dtype=np.float32), None, None,
            np.logical_not(terminals, dtype=bool))
Example 10
    def preprocess(self, batch) -> TrainingDataPage:
        # Preprocess state features
        sorted_state_features, _ = (
            self.state_preprocessor._sort_features_by_normalization())
        sorted_state_features_str = [str(x) for x in sorted_state_features]
        state_features_dense = self.sparse_to_dense_processor(
            sorted_state_features_str, batch["state_features"])
        next_state_features_dense = self.sparse_to_dense_processor(
            sorted_state_features_str, batch["next_state_features"])

        state_features_dense = self.state_preprocessor.forward(
            state_features_dense)
        next_state_features_dense = self.state_preprocessor.forward(
            next_state_features_dense)

        mdp_ids = np.array(batch["mdp_id"]).reshape(-1, 1)
        sequence_numbers = torch.tensor(batch["sequence_number"],
                                        dtype=torch.int32).reshape(-1, 1)
        rewards = torch.tensor(batch["reward"],
                               dtype=torch.float32).reshape(-1, 1)
        time_diffs = torch.tensor(batch["time_diff"],
                                  dtype=torch.int32).reshape(-1, 1)
        if "action_probability" in batch:
            propensities = torch.tensor(batch["action_probability"],
                                        dtype=torch.float32).reshape(-1, 1)
        else:
            propensities = torch.ones(rewards.shape, dtype=torch.float32)

        return TrainingDataPage(
            mdp_ids=mdp_ids,
            sequence_numbers=sequence_numbers,
            states=state_features_dense,
            propensities=propensities,
            rewards=rewards,
            next_states=next_state_features_dense,
            time_diffs=time_diffs,
        )
Example 11
    def preprocess_samples(
        self,
        samples: Samples,
        minibatch_size: int,
        use_gpu: bool = False,
        one_hot_action: bool = True,
        normalize_actions: bool = True,
    ) -> List[TrainingDataPage]:
        logger.info("Shuffling...")
        samples.shuffle()

        logger.info("Sparse2Dense...")
        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        sorted_state_features, _ = sort_features_by_normalization(self.normalization)
        state_matrix, _ = sparse_to_dense(
            saa.lengths, saa.keys, saa.values, sorted_state_features
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states, "next_states")
        next_state_matrix, _ = sparse_to_dense(
            saa.lengths, saa.keys, saa.values, sorted_state_features
        )
        sorted_action_features, _ = sort_features_by_normalization(
            self.normalization_action
        )
        saa = StackedAssociativeArray.from_dict_list(samples.actions, "action")
        action_matrix, _ = sparse_to_dense(
            saa.lengths, saa.keys, saa.values, sorted_action_features
        )
        saa = StackedAssociativeArray.from_dict_list(
            samples.next_actions, "next_action"
        )
        next_action_matrix, _ = sparse_to_dense(
            saa.lengths, saa.keys, saa.values, sorted_action_features
        )
        action_probabilities = torch.tensor(
            samples.action_probabilities, dtype=torch.float32
        ).reshape(-1, 1)
        rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1)

        pnas_lengths_list = []
        pnas_flat: List[List[str]] = []
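        # Flatten the ragged possible-next-actions lists into one long list and
        # record per-row lengths so the rows can be re-split per minibatch below.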
        for pnas in samples.possible_next_actions:
            pnas_lengths_list.append(len(pnas))
            pnas_flat.extend(pnas)
        saa = StackedAssociativeArray.from_dict_list(pnas_flat, "possible_next_actions")

        pnas_lengths = torch.tensor(pnas_lengths_list, dtype=torch.int32)
        pna_lens_blob = "pna_lens_blob"
        workspace.FeedBlob(pna_lens_blob, pnas_lengths.numpy())

        possible_next_actions_matrix, _ = sparse_to_dense(
            saa.lengths, saa.keys, saa.values, sorted_action_features
        )

        state_pnas_tile_blob = C2.LengthsTile(next_state_matrix, pna_lens_blob)

        workspace.RunNetOnce(net)

        logger.info("Preprocessing...")
        state_preprocessor = Preprocessor(self.normalization, False)
        action_preprocessor = Preprocessor(self.normalization_action, False)

        states_ndarray = workspace.FetchBlob(state_matrix)
        states_ndarray = state_preprocessor.forward(states_ndarray)

        actions_ndarray = torch.from_numpy(workspace.FetchBlob(action_matrix))
        if normalize_actions:
            actions_ndarray = action_preprocessor.forward(actions_ndarray)

        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_states_ndarray = state_preprocessor.forward(next_states_ndarray)

        next_actions_ndarray = torch.from_numpy(workspace.FetchBlob(next_action_matrix))
        if normalize_actions:
            next_actions_ndarray = action_preprocessor.forward(next_actions_ndarray)

        logged_possible_next_actions = action_preprocessor.forward(
            workspace.FetchBlob(possible_next_actions_matrix)
        )

        state_pnas_tile = state_preprocessor.forward(
            workspace.FetchBlob(state_pnas_tile_blob)
        )
        logged_possible_next_state_actions = torch.cat(
            (state_pnas_tile, logged_possible_next_actions), dim=1
        )

        logger.info("Reward Timeline to Torch...")
        possible_next_actions_ndarray = logged_possible_next_actions
        possible_next_actions_state_concat = logged_possible_next_state_actions
        time_diffs = torch.ones([len(samples.states), 1])

        tdps = []
        pnas_start = 0
        logger.info("Batching...")
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + torch.sum(pnas_lengths[start:end])
            pnas = possible_next_actions_ndarray[pnas_start:pnas_end]
            pnas_concat = possible_next_actions_state_concat[pnas_start:pnas_end]
            pnas_start = pnas_end
            tdp = TrainingDataPage(
                states=states_ndarray[start:end],
                actions=actions_ndarray[start:end],
                propensities=action_probabilities[start:end],
                rewards=rewards[start:end],
                next_states=next_states_ndarray[start:end],
                next_actions=next_actions_ndarray[start:end],
                possible_next_actions=None,
                not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1),
                time_diffs=time_diffs[start:end],
                possible_next_actions_lengths=pnas_lengths[start:end],
                possible_next_actions_state_concat=pnas_concat,
            )
            tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor)
            tdps.append(tdp)
        return tdps
Example 12
    def preprocess_samples_discrete(
            self, samples: Samples,
            minibatch_size: int) -> List[TrainingDataPage]:
        samples.shuffle()

        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        preprocessor = PreprocessorNet(True)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "state_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "next_state_norm",
            False,
            False,
        )
        workspace.RunNetOnce(net)
        actions_one_hot = np.zeros(
            [len(samples.actions), len(self.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(samples.actions):
            actions_one_hot[i, self.action_to_index(action)] = 1
        rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1)
        propensities = np.array(samples.propensities,
                                dtype=np.float32).reshape(-1, 1)
        next_actions_one_hot = np.zeros(
            [len(samples.next_actions),
             len(self.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(samples.next_actions):
            if action == "":
                continue
            next_actions_one_hot[i, self.action_to_index(action)] = 1
        possible_next_actions_mask = []
        for pna in samples.possible_next_actions:
            pna_mask = [0] * self.num_actions
            for action in pna:
                pna_mask[self.action_to_index(action)] = 1
            possible_next_actions_mask.append(pna_mask)
        possible_next_actions_mask = np.array(possible_next_actions_mask,
                                              dtype=np.float32)
        is_terminals = np.array(samples.is_terminal,
                                dtype=bool).reshape(-1, 1)
        not_terminals = np.logical_not(is_terminals)
        if samples.reward_timelines is not None:
            reward_timelines = np.array(samples.reward_timelines, dtype=object)
        else:
            # Without logged timelines there is nothing to slice later on.
            reward_timelines = None

        states_ndarray = workspace.FetchBlob(state_matrix)
        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        tdps = []
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_one_hot[start:end],
                    propensities=propensities[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    not_terminals=not_terminals[start:end],
                    next_actions=next_actions_one_hot[start:end],
                    possible_next_actions=possible_next_actions_mask[
                        start:end],
                    reward_timelines=reward_timelines[start:end]
                    if reward_timelines is not None else None,
                ))
        return tdps
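The mask-building loops above map lists of available action names onto a fixed-width binary matrix. A standalone sketch of that idea (the action names and helper name here are illustrative only):

import numpy as np

def build_actions_mask(available, all_actions):
    """Binary [batch, num_actions] mask marking which actions were available per row."""
    mask = np.zeros((len(available), len(all_actions)), dtype=np.float32)
    for row, names in enumerate(available):
        for name in names:
            mask[row, all_actions.index(name)] = 1.0
    return mask

# build_actions_mask([["L", "R"], ["U"]], ["L", "R", "U", "D"]) ->
# [[1, 1, 0, 0],
#  [0, 0, 1, 0]]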
Example 13
    def preprocess_samples(
        self,
        samples: Samples,
        minibatch_size: int,
        use_gpu: bool = False,
        one_hot_action: bool = True,
        normalize_actions: bool = True,
    ) -> List[TrainingDataPage]:
        logger.info("Shuffling...")
        samples = shuffle_samples(samples)

        logger.info("Sparse2Dense...")
        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        sorted_state_features, _ = sort_features_by_normalization(
            self.normalization)
        sorted_action_features, _ = sort_features_by_normalization(
            self.normalization_action)
        state_sparse_to_dense_processor = Caffe2SparseToDenseProcessor(
            sorted_state_features)
        action_sparse_to_dense_processor = Caffe2SparseToDenseProcessor(
            sorted_action_features)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, state_matrix_presence, _ = state_sparse_to_dense_processor(
            saa)
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, next_state_matrix_presence, _ = state_sparse_to_dense_processor(
            saa)
        saa = StackedAssociativeArray.from_dict_list(  # type: ignore
            samples.actions, "action")
        action_matrix, action_matrix_presence, _ = action_sparse_to_dense_processor(
            saa)
        saa = StackedAssociativeArray.from_dict_list(  # type: ignore
            samples.next_actions, "next_action")
        next_action_matrix, next_action_matrix_presence, _ = action_sparse_to_dense_processor(
            saa)
        action_probabilities = torch.tensor(samples.action_probabilities,
                                            dtype=torch.float32).reshape(
                                                -1, 1)
        rewards = torch.tensor(samples.rewards,
                               dtype=torch.float32).reshape(-1, 1)

        max_action_size = 4
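        # Every row's possible next actions are padded (with empty feature dicts)
        # to this fixed width so the dense matrix below has a rectangular shape.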

        pnas_mask_list: List[List[int]] = []
        pnas_flat: List[Dict[str, float]] = []
        for pnas in samples.possible_next_actions:
            pnas_mask_list.append([1] * len(pnas) + [0] *
                                  (max_action_size - len(pnas)))
            pnas_flat.extend(pnas)  # type: ignore
            for _ in range(max_action_size - len(pnas)):
                pnas_flat.append({})  # Filler
        saa = StackedAssociativeArray.from_dict_list(  # type: ignore
            pnas_flat, "possible_next_actions")
        pnas_mask = torch.Tensor(pnas_mask_list)

        possible_next_actions_matrix, possible_next_actions_matrix_presence, _ = action_sparse_to_dense_processor(
            saa)

        workspace.RunNetOnce(net)

        logger.info("Preprocessing...")
        state_preprocessor = Preprocessor(self.normalization, False)
        action_preprocessor = Preprocessor(self.normalization_action, False)

        states_ndarray = state_preprocessor(
            torch.from_numpy(workspace.FetchBlob(state_matrix)),
            torch.from_numpy(
                workspace.FetchBlob(state_matrix_presence)).float(),
        )

        if normalize_actions:
            actions_ndarray = action_preprocessor(
                torch.from_numpy(workspace.FetchBlob(action_matrix)),
                torch.from_numpy(
                    workspace.FetchBlob(action_matrix_presence)).float(),
            )
        else:
            actions_ndarray = torch.from_numpy(
                workspace.FetchBlob(action_matrix))

        next_states_ndarray = torch.from_numpy(
            workspace.FetchBlob(next_state_matrix))
        next_states_ndarray = state_preprocessor(
            next_states_ndarray,
            (next_states_ndarray != MISSING_VALUE).float())

        state_pnas_tile = next_states_ndarray.repeat(
            1, max_action_size).reshape(-1, next_states_ndarray.shape[1])
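        # Each next state is now repeated max_action_size times, lining up with
        # the padded possible-next-action rows for the concatenation below.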

        if normalize_actions:
            next_actions_ndarray = action_preprocessor(
                torch.from_numpy(workspace.FetchBlob(next_action_matrix)),
                torch.from_numpy(
                    workspace.FetchBlob(next_action_matrix_presence)).float(),
            )
        else:
            next_actions_ndarray = torch.from_numpy(
                workspace.FetchBlob(next_action_matrix))

        if normalize_actions:
            logged_possible_next_actions = action_preprocessor(
                torch.from_numpy(
                    workspace.FetchBlob(possible_next_actions_matrix)),
                torch.from_numpy(
                    workspace.FetchBlob(
                        possible_next_actions_matrix_presence)).float(),
            )
        else:
            logged_possible_next_actions = torch.from_numpy(
                workspace.FetchBlob(possible_next_actions_matrix))

        assert state_pnas_tile.shape[0] == logged_possible_next_actions.shape[
            0], ("Invalid shapes: " + str(state_pnas_tile.shape) + " != " +
                 str(logged_possible_next_actions.shape))
        logged_possible_next_state_actions = torch.cat(
            (state_pnas_tile, logged_possible_next_actions), dim=1)

        logger.info("Reward Timeline to Torch...")
        time_diffs = torch.ones([len(samples.states), 1])

        tdps = []
        pnas_start = 0
        logger.info("Batching...")
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + (minibatch_size * max_action_size)
            tdp = TrainingDataPage(
                states=states_ndarray[start:end],
                actions=actions_ndarray[start:end],
                propensities=action_probabilities[start:end],
                rewards=rewards[start:end],
                next_states=next_states_ndarray[start:end],
                next_actions=next_actions_ndarray[start:end],
                not_terminal=(pnas_mask[start:end, :].sum(dim=1, keepdim=True)
                              > 0),
                time_diffs=time_diffs[start:end],
                possible_next_actions_mask=pnas_mask[start:end, :],
                possible_next_actions_state_concat=
                logged_possible_next_state_actions[pnas_start:pnas_end, :],
            )
            pnas_start = pnas_end
            tdp.set_type(
                torch.cuda.FloatTensor if use_gpu else torch.FloatTensor  # type: ignore
            )
            tdps.append(tdp)
        return tdps
Example 14
    def preprocess_samples_discrete(
        self,
        samples: Samples,
        minibatch_size: int,
        one_hot_action: bool = True,
        use_gpu: bool = False,
        do_shuffle: bool = True,
    ) -> List[TrainingDataPage]:

        if do_shuffle:
            logger.info("Shuffling...")
            samples = shuffle_samples(samples)

        logger.info("Preprocessing...")
        sparse_to_dense_processor = Caffe2SparseToDenseProcessor()

        if self.sparse_to_dense_net is None:
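            # First call: build the Caffe2 sparse-to-dense net once; later calls
            # reuse it and only refresh the input arrays (the else branch below).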
            self.sparse_to_dense_net = core.Net("gridworld_sparse_to_dense")
            C2.set_net(self.sparse_to_dense_net)
            saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
            sorted_features, _ = sort_features_by_normalization(self.normalization)
            self.state_matrix, _ = sparse_to_dense_processor(sorted_features, saa)
            saa = StackedAssociativeArray.from_dict_list(
                samples.next_states, "next_states"
            )
            self.next_state_matrix, _ = sparse_to_dense_processor(sorted_features, saa)
            C2.set_net(None)
        else:
            StackedAssociativeArray.from_dict_list(samples.states, "states")
            StackedAssociativeArray.from_dict_list(samples.next_states, "next_states")
        workspace.RunNetOnce(self.sparse_to_dense_net)

        logger.info("Converting to Torch...")
        actions_one_hot = torch.tensor(
            (np.array(samples.actions).reshape(-1, 1) == np.array(self.ACTIONS)).astype(
                np.int64
            )
        )
        actions = actions_one_hot.argmax(dim=1, keepdim=True)
        rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1)
        action_probabilities = torch.tensor(
            samples.action_probabilities, dtype=torch.float32
        ).reshape(-1, 1)
        next_actions_one_hot = torch.tensor(
            (
                np.array(samples.next_actions).reshape(-1, 1) == np.array(self.ACTIONS)
            ).astype(np.int64)
        )
        logger.info("Converting PA to Torch...")
        possible_action_strings = np.array(
            list(itertools.zip_longest(*samples.possible_actions, fillvalue=""))
        ).T
        possible_actions_mask = torch.zeros([len(samples.actions), len(self.ACTIONS)])
        for i, action in enumerate(self.ACTIONS):
            possible_actions_mask[:, i] = torch.tensor(
                np.max(possible_action_strings == action, axis=1).astype(np.int64)
            )
        logger.info("Converting PNA to Torch...")
        possible_next_action_strings = np.array(
            list(itertools.zip_longest(*samples.possible_next_actions, fillvalue=""))
        ).T
        possible_next_actions_mask = torch.zeros(
            [len(samples.next_actions), len(self.ACTIONS)]
        )
        for i, action in enumerate(self.ACTIONS):
            possible_next_actions_mask[:, i] = torch.tensor(
                np.max(possible_next_action_strings == action, axis=1).astype(np.int64)
            )
        terminals = torch.tensor(samples.terminals, dtype=torch.int32).reshape(-1, 1)
        not_terminal = 1 - terminals
        logger.info("Converting RT to Torch...")

        time_diffs = torch.ones([len(samples.states), 1])

        logger.info("Preprocessing...")
        preprocessor = Preprocessor(self.normalization, False)

        states_ndarray = workspace.FetchBlob(self.state_matrix)
        states_ndarray = preprocessor.forward(states_ndarray)

        next_states_ndarray = workspace.FetchBlob(self.next_state_matrix)
        next_states_ndarray = preprocessor.forward(next_states_ndarray)

        logger.info("Batching...")
        tdps = []
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            tdp = TrainingDataPage(
                states=states_ndarray[start:end],
                actions=actions_one_hot[start:end]
                if one_hot_action
                else actions[start:end],
                propensities=action_probabilities[start:end],
                rewards=rewards[start:end],
                next_states=next_states_ndarray[start:end],
                not_terminal=not_terminal[start:end],
                next_actions=next_actions_one_hot[start:end],
                possible_actions_mask=possible_actions_mask[start:end],
                possible_next_actions_mask=possible_next_actions_mask[start:end],
                time_diffs=time_diffs[start:end],
            )
            tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor)
            tdps.append(tdp)
        return tdps
Example 15
    def preprocess_samples(self, samples: Samples,
                           minibatch_size: int) -> List[TrainingDataPage]:
        samples.shuffle()

        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        preprocessor = PreprocessorNet(True)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "state_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "next_state_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.actions, "action")
        action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "action_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_actions,
                                                     "next_action")
        next_action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "next_action_norm",
            False,
            False,
        )
        propensities = np.array(samples.propensities,
                                dtype=np.float32).reshape(-1, 1)
        rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1)

        pnas_lengths_list = []
        pnas_flat: List[List[str]] = []
        for pnas in samples.possible_next_actions:
            pnas_lengths_list.append(len(pnas))
            pnas_flat.extend(pnas)
        saa = StackedAssociativeArray.from_dict_list(pnas_flat,
                                                     "possible_next_actions")
        pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32)
        possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "possible_next_action_norm",
            False,
            False,
        )
        workspace.RunNetOnce(net)

        states_ndarray = workspace.FetchBlob(state_matrix)
        actions_ndarray = workspace.FetchBlob(action_matrix)
        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_actions_ndarray = workspace.FetchBlob(next_action_matrix)
        possible_next_actions_ndarray = workspace.FetchBlob(
            possible_next_actions_matrix)
        tdps = []
        pnas_start = 0
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + np.sum(pnas_lengths[start:end])
            pnas = possible_next_actions_ndarray[pnas_start:pnas_end]
            pnas_start = pnas_end
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_ndarray[start:end],
                    propensities=propensities[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    next_actions=next_actions_ndarray[start:end],
                    possible_next_actions=StackedArray(pnas_lengths[start:end],
                                                       pnas),
                    not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1),
                    reward_timelines=samples.reward_timelines[start:end]
                    if samples.reward_timelines else None,
                ))
        return tdps
Example 16
    def train(
        self, training_samples: TrainingDataPage, evaluator: Optional[Evaluator] = None
    ) -> None:
        training_samples.set_type(self.dtype)

        if self.minibatch == 0:
            # Assume that the tensors are the right shape after the first minibatch
            assert training_samples.states.shape[0] == self.minibatch_size, (
                "Invalid shape: " + str(training_samples.states.shape)
            )
            assert training_samples.actions.shape == torch.Size(
                [self.minibatch_size, len(self._actions)]
            ), "Invalid shape: " + str(training_samples.actions.shape)
            assert training_samples.rewards.shape == torch.Size(
                [self.minibatch_size, 1]
            ), "Invalid shape: " + str(training_samples.rewards.shape)
            assert (
                training_samples.episode_values is None
                or training_samples.episode_values.shape
                == training_samples.rewards.shape
            ), (
                "Invalid shape: " + str(training_samples.episode_values.shape)
            )
            assert (
                training_samples.next_states.shape == training_samples.states.shape
            ), (
                "Invalid shape: " + str(training_samples.next_states.shape)
            )
            assert (
                training_samples.not_terminals.shape == training_samples.rewards.shape
            ), (
                "Invalid shape: " + str(training_samples.not_terminals.shape)
            )
            if training_samples.possible_next_actions is not None:
                assert (
                    training_samples.possible_next_actions.shape
                    == training_samples.actions.shape
                ), (
                    "Invalid shape: "
                    + str(training_samples.possible_next_actions.shape)
                )
            if training_samples.propensities is not None:
                assert (
                    training_samples.propensities.shape
                    == training_samples.rewards.shape
                ), (
                    "Invalid shape: " + str(training_samples.propensities.shape)
                )

        # Apply reward boost if specified
        reward_boosts = torch.sum(
            training_samples.actions * self.reward_boosts, dim=1, keepdim=True
        )
        boosted_rewards = training_samples.rewards + reward_boosts

        self.minibatch += 1
        states = training_samples.states.detach().requires_grad_(True)
        actions = training_samples.actions
        rewards = boosted_rewards
        next_states = training_samples.next_states
        discount_tensor = torch.full(
            training_samples.time_diffs.shape, self.gamma
        ).type(self.dtype)
        not_done_mask = training_samples.not_terminals

        if self.use_seq_num_diff_as_time_diff:
            discount_tensor = discount_tensor.pow(training_samples.time_diffs)

        if self.maxq_learning:
            # Compute max a' Q(s', a') over all possible actions using target network
            possible_next_actions = training_samples.possible_next_actions
            next_q_values = self.get_max_q_values(
                next_states, possible_next_actions, self.double_q_learning
            )
        else:
            # SARSA
            next_actions = training_samples.next_actions
            next_q_values = self.get_next_action_q_values(next_states, next_actions)

        filtered_next_q_vals = next_q_values * not_done_mask

        if self.use_reward_burnin and self.minibatch < self.reward_burnin:
            target_q_values = rewards
        else:
            target_q_values = rewards + (discount_tensor * filtered_next_q_vals)

        # Get Q-value of action taken
        all_q_values = self.q_network(states)
        self.all_action_scores = deepcopy(all_q_values.detach())
        q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

        logger.info(q_values.shape)
        logger.info(target_q_values.shape)
        logger.info(rewards.shape)
        logger.info(next_q_values.shape)
        loss = self.q_network_loss(q_values, target_q_values)
        self.loss = loss.detach()

        self.q_network_optimizer.zero_grad()
        loss.backward()
        if self.gradient_handler:
            self.gradient_handler(self.q_network.parameters())
        self.q_network_optimizer.step()

        if self.use_reward_burnin and self.minibatch < self.reward_burnin:
            # Reward burnin: force target network
            self._soft_update(self.q_network, self.q_network_target, 1.0)
        else:
            # Use the soft update rule to update target network
            self._soft_update(self.q_network, self.q_network_target, self.tau)

        # get reward estimates
        reward_estimates = self.reward_network(states)
        self.reward_estimates = reward_estimates.detach()
        reward_estimates_for_logged_actions = reward_estimates.gather(
            1, actions.argmax(dim=1, keepdim=True)
        )
        reward_loss = F.mse_loss(reward_estimates_for_logged_actions, rewards)
        self.reward_network_optimizer.zero_grad()
        reward_loss.backward()
        self.reward_network_optimizer.step()

        if evaluator is not None:
            self.evaluate(
                evaluator,
                training_samples.actions,
                training_samples.propensities,
                boosted_rewards,
                training_samples.episode_values,
            )
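A hedged sketch of how these pieces might fit together: preprocessed TrainingDataPage minibatches handed to a trainer exposing the train() signature above. The trainer, env and samples objects, as well as the exact preprocess_samples_discrete signature (which varies across the examples on this page), are assumptions, not library code:

def run_training(trainer, env, samples, minibatch_size=1024, num_epochs=10):
    # Re-batch each epoch and hand every TrainingDataPage to the trainer.
    for _ in range(num_epochs):
        for tdp in env.preprocess_samples_discrete(samples, minibatch_size):
            trainer.train(tdp)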
Example 17
    def preprocess_samples(self, samples: Samples,
                           minibatch_size: int) -> List[TrainingDataPage]:
        samples.shuffle()

        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        preprocessor = PreprocessorNet(True)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "state_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "next_state_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.actions, "action")
        action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "action_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_actions,
                                                     "next_action")
        next_action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "next_action_norm",
            False,
            False,
            False,
        )
        propensities = np.array(samples.propensities,
                                dtype=np.float32).reshape(-1, 1)
        rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1)

        pnas_lengths_list = []
        pnas_flat: List[List[str]] = []
        for pnas in samples.possible_next_actions:
            pnas_lengths_list.append(len(pnas))
            pnas_flat.extend(pnas)
        saa = StackedAssociativeArray.from_dict_list(pnas_flat,
                                                     "possible_next_actions")

        pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32)
        pna_lens_blob = "pna_lens_blob"
        workspace.FeedBlob(pna_lens_blob, pnas_lengths)

        possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "possible_next_action_norm",
            False,
            False,
            False,
        )

        state_pnas_tile_blob = C2.LengthsTile(next_state_matrix, pna_lens_blob)

        workspace.RunNetOnce(net)

        state_preprocessor = Preprocessor(self.normalization, False)
        action_preprocessor = Preprocessor(self.normalization_action, False)

        states_ndarray = workspace.FetchBlob(state_matrix)
        states_ndarray = state_preprocessor.forward(states_ndarray).numpy()

        actions_ndarray = workspace.FetchBlob(action_matrix)
        actions_ndarray = action_preprocessor.forward(actions_ndarray).numpy()

        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_states_ndarray = state_preprocessor.forward(
            next_states_ndarray).numpy()

        next_actions_ndarray = workspace.FetchBlob(next_action_matrix)
        next_actions_ndarray = action_preprocessor.forward(
            next_actions_ndarray).numpy()

        logged_possible_next_actions = action_preprocessor.forward(
            workspace.FetchBlob(possible_next_actions_matrix))

        state_pnas_tile = state_preprocessor.forward(
            workspace.FetchBlob(state_pnas_tile_blob))
        logged_possible_next_state_actions = torch.cat(
            (state_pnas_tile, logged_possible_next_actions), dim=1)

        possible_next_actions_ndarray = logged_possible_next_actions.cpu().numpy()
        next_state_pnas_concat = logged_possible_next_state_actions.cpu().numpy()
        time_diffs = np.ones(len(states_ndarray))
        episode_values = None
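        # Episode value for each transition = discounted sum of its logged
        # reward timeline, using the module-level DISCOUNT factor.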
        if samples.reward_timelines is not None:
            episode_values = np.zeros(rewards.shape, dtype=np.float32)
            for i, reward_timeline in enumerate(samples.reward_timelines):
                for time_diff, reward in reward_timeline.items():
                    episode_values[i, 0] += reward * (DISCOUNT**time_diff)

        tdps = []
        pnas_start = 0
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + np.sum(pnas_lengths[start:end])
            pnas = possible_next_actions_ndarray[pnas_start:pnas_end]
            pnas_concat = next_state_pnas_concat[pnas_start:pnas_end]
            pnas_start = pnas_end
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_ndarray[start:end],
                    propensities=propensities[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    next_actions=next_actions_ndarray[start:end],
                    possible_next_actions=StackedArray(pnas_lengths[start:end],
                                                       pnas),
                    not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1),
                    episode_values=episode_values[start:end]
                    if episode_values is not None else None,
                    time_diffs=time_diffs[start:end],
                    possible_next_actions_lengths=pnas_lengths[start:end],
                    next_state_pnas_concat=pnas_concat,
                ))
        return tdps
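The timeline-to-value conversion in the loop above is just a discounted sum. A tiny standalone equivalent (the discount value and timeline dict here are illustrative, not taken from the library):

DISCOUNT = 0.9  # illustrative; the example above uses a module-level DISCOUNT

def discounted_episode_value(reward_timeline, discount=DISCOUNT):
    """Sum rewards keyed by time offset, discounted as in the loop above."""
    return sum(reward * (discount ** t) for t, reward in reward_timeline.items())

# discounted_episode_value({0: 1.0, 3: 2.0}) == 1.0 + 2.0 * 0.9 ** 3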
Example 18
    def train(
        self, training_samples: TrainingDataPage, evaluator=None, episode_values=None
    ) -> None:
        training_samples.set_type(self.dtype)

        if self.minibatch == 0:
            # Assume that the tensors are the right shape after the first minibatch
            assert training_samples.states.shape[0] == self.minibatch_size, (
                "Invalid shape: " + str(training_samples.states.shape)
            )
            assert training_samples.actions.shape[0] == self.minibatch_size, (
                "Invalid shape: " + str(training_samples.actions.shape)
            )
            assert training_samples.rewards.shape == torch.Size(
                [self.minibatch_size, 1]
            ), "Invalid shape: " + str(training_samples.rewards.shape)
            assert (
                training_samples.episode_values is None
                or training_samples.episode_values.shape
                == training_samples.rewards.shape
            ), (
                "Invalid shape: " + str(training_samples.episode_values.shape)
            )
            assert (
                training_samples.next_states.shape == training_samples.states.shape
            ), (
                "Invalid shape: " + str(training_samples.next_states.shape)
            )
            assert (
                training_samples.not_terminals.shape == training_samples.rewards.shape
            ), (
                "Invalid shape: " + str(training_samples.not_terminals.shape)
            )
            if self.use_seq_num_diff_as_time_diff:
                assert (
                    training_samples.time_diffs.shape == training_samples.rewards.shape
                ), (
                    "Invalid shape: " + str(training_samples.time_diffs.shape)
                )

        self.minibatch += 1
        states = training_samples.states.detach().requires_grad_(True)
        actions = training_samples.actions.detach().requires_grad_(True)

        # As far as ddpg is concerned all actions are [-1, 1] due to actor tanh
        actions = rescale_torch_tensor(
            actions,
            new_min=self.min_action_range_tensor_training,
            new_max=self.max_action_range_tensor_training,
            prev_min=self.min_action_range_tensor_serving,
            prev_max=self.max_action_range_tensor_serving,
        )
        rewards = training_samples.rewards
        next_states = torch.tensor(training_samples.next_states, requires_grad=True)
        time_diffs = training_samples.time_diffs
        discount_tensor = torch.tensor(np.full(rewards.shape, self.gamma)).type(
            self.dtype
        )
        not_done_mask = training_samples.not_terminals

        # Optimize the critic network subject to mean squared error:
        # L = ([r + gamma * Q(s2, a2)] - Q(s1, a1)) ^ 2
        q_s1_a1 = self.critic(torch.cat((states, actions), dim=1))
        next_actions = self.actor_target(next_states)

        next_state_actions = torch.cat((next_states, next_actions), dim=1)
        q_s2_a2 = self.critic_target(next_state_actions)
        filtered_q_s2_a2 = not_done_mask * q_s2_a2

        if self.use_seq_num_diff_as_time_diff:
            discount_tensor = discount_tensor.pow(time_diffs)

        if self.use_reward_burnin and self.minibatch < self.reward_burnin:
            target_q_values = rewards
        else:
            target_q_values = rewards + (discount_tensor * filtered_q_s2_a2)

        # compute loss and update the critic network
        critic_predictions = q_s1_a1
        loss_critic = self.q_network_loss(critic_predictions, target_q_values)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        # Optimize the actor network subject to the following:
        # max sum(Q(s1, a1)) or min -sum(Q(s1, a1))
        loss_actor = -self.critic(torch.cat((states, self.actor(states)), dim=1)).sum()
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        if self.use_reward_burnin and self.minibatch < self.reward_burnin:
            # Reward burnin: force target network
            self._soft_update(self.actor, self.actor_target, 1.0)
            self._soft_update(self.critic, self.critic_target, 1.0)
        else:
            # Use the soft update rule to update both target networks
            self._soft_update(self.actor, self.actor_target, self.tau)
            self._soft_update(self.critic, self.critic_target, self.tau)

        if evaluator is not None:
            evaluator.report(
                loss_critic.cpu().data.numpy(),
                None,
                None,
                None,
                episode_values,
                None,
                None,
                None,
                critic_predictions.cpu().data.numpy(),
                None,
            )
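Both trainers above call a _soft_update helper to move the target networks toward the online networks, but its body is not shown here. A generic Polyak-averaging sketch of what such a helper typically does (an assumption, not the library's implementation):

import torch

def soft_update(net, target_net, tau):
    """Parameter-wise update: target <- tau * net + (1 - tau) * target."""
    with torch.no_grad():
        for p, target_p in zip(net.parameters(), target_net.parameters()):
            target_p.mul_(1.0 - tau).add_(p, alpha=tau)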
Example 19
    def preprocess_samples_discrete(
        self,
        states: List[Dict[int, float]],
        actions: List[str],
        rewards: List[float],
        next_states: List[Dict[int, float]],
        next_actions: List[str],
        is_terminals: List[bool],
        possible_next_actions: List[List[str]],
        reward_timelines: Optional[List[Dict[int, float]]],
        minibatch_size: int,
    ) -> List[TrainingDataPage]:
        # Shuffle
        if reward_timelines is None:
            merged = list(
                zip(states, actions, rewards, next_states, next_actions,
                    is_terminals, possible_next_actions))
            random.shuffle(merged)
            states, actions, rewards, next_states, next_actions, \
                is_terminals, possible_next_actions = zip(*merged)
        else:
            merged = list(
                zip(states, actions, rewards, next_states, next_actions,
                    is_terminals, possible_next_actions, reward_timelines))
            random.shuffle(merged)
            states, actions, rewards, next_states, next_actions, \
                is_terminals, possible_next_actions, reward_timelines = zip(*merged)

        net = core.Net('gridworld_preprocessing')
        C2.set_net(net)
        preprocessor = PreprocessorNet(net, True)
        saa = StackedAssociativeArray.from_dict_list(states, 'states')
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            'state_norm',
        )
        saa = StackedAssociativeArray.from_dict_list(next_states,
                                                     'next_states')
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            'next_state_norm',
        )
        workspace.RunNetOnce(net)
        actions_one_hot = np.zeros(
            [len(actions), len(self.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(actions):
            actions_one_hot[i, self.ACTIONS.index(action)] = 1
        rewards = np.array(rewards, dtype=np.float32).reshape(-1, 1)
        next_actions_one_hot = np.zeros(
            [len(next_actions), len(self.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(next_actions):
            if action == '':
                continue
            next_actions_one_hot[i, self.ACTIONS.index(action)] = 1
        possible_next_actions_mask = []
        for pna in possible_next_actions:
            pna_mask = [0] * self.num_actions
            for action in pna:
                pna_mask[self.ACTIONS.index(action)] = 1
            possible_next_actions_mask.append(pna_mask)
        possible_next_actions_mask = np.array(possible_next_actions_mask,
                                              dtype=np.float32)
        is_terminals = np.array(is_terminals, dtype=bool).reshape(-1, 1)
        not_terminals = np.logical_not(is_terminals)
        if reward_timelines is not None:
            reward_timelines = np.array(reward_timelines, dtype=object)

        states_ndarray = workspace.FetchBlob(state_matrix)
        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        tdps = []
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_one_hot[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    not_terminals=not_terminals[start:end],
                    next_actions=next_actions_one_hot[start:end],
                    possible_next_actions=possible_next_actions_mask[
                        start:end],
                    reward_timelines=reward_timelines[start:end]
                    if reward_timelines is not None else None,
                ))
        return tdps
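For intuition, the one-hot actions and the possible-next-actions mask built above look like this on a toy action set (the action names and transitions below are made up for illustration):

import numpy as np

ACTIONS = ["L", "R", "U", "D"]            # hypothetical discrete action set
actions = ["R", "D"]                      # logged actions for two transitions
possible_next_actions = [["L", "R"], []]  # an empty list marks a terminal step

actions_one_hot = np.zeros((len(actions), len(ACTIONS)), dtype=np.float32)
for i, a in enumerate(actions):
    actions_one_hot[i, ACTIONS.index(a)] = 1.0

pna_mask = np.zeros((len(possible_next_actions), len(ACTIONS)), dtype=np.float32)
for i, pna in enumerate(possible_next_actions):
    for a in pna:
        pna_mask[i, ACTIONS.index(a)] = 1.0

# actions_one_hot -> [[0, 1, 0, 0], [0, 0, 0, 1]]
# pna_mask        -> [[1, 1, 0, 0], [0, 0, 0, 0]]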
Ejemplo n.º 20
0
    def preprocess_samples(
        self,
        samples: Samples,
        minibatch_size: int,
        use_gpu: bool = False,
        one_hot_action: bool = True,
        normalize_actions: bool = True,
    ) -> List[TrainingDataPage]:
        logger.info("Shuffling...")
        samples = shuffle_samples(samples)

        logger.info("Sparse2Dense...")
        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        sorted_state_features, _ = sort_features_by_normalization(self.normalization)
        state_matrix, _ = sparse_to_dense(
            saa.lengths, saa.keys, saa.values, sorted_state_features
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states, "next_states")
        next_state_matrix, _ = sparse_to_dense(
            saa.lengths, saa.keys, saa.values, sorted_state_features
        )
        sorted_action_features, _ = sort_features_by_normalization(
            self.normalization_action
        )
        saa = StackedAssociativeArray.from_dict_list(samples.actions, "action")
        action_matrix, _ = sparse_to_dense(
            saa.lengths, saa.keys, saa.values, sorted_action_features
        )
        saa = StackedAssociativeArray.from_dict_list(
            samples.next_actions, "next_action"
        )
        next_action_matrix, _ = sparse_to_dense(
            saa.lengths, saa.keys, saa.values, sorted_action_features
        )
        action_probabilities = torch.tensor(
            samples.action_probabilities, dtype=torch.float32
        ).reshape(-1, 1)
        rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1)

        max_action_size = 4

        pnas_mask_list: List[List[int]] = []
        pnas_flat: List[Dict[str, float]] = []
        for pnas in samples.possible_next_actions:
            pnas_mask_list.append([1] * len(pnas) + [0] * (max_action_size - len(pnas)))
            pnas_flat.extend(pnas)
            for _ in range(max_action_size - len(pnas)):
                pnas_flat.append({})  # Filler
        saa = StackedAssociativeArray.from_dict_list(pnas_flat, "possible_next_actions")
        pnas_mask = torch.Tensor(pnas_mask_list)

        possible_next_actions_matrix, _ = sparse_to_dense(
            saa.lengths, saa.keys, saa.values, sorted_action_features
        )

        workspace.RunNetOnce(net)

        logger.info("Preprocessing...")
        state_preprocessor = Preprocessor(self.normalization, False)
        action_preprocessor = Preprocessor(self.normalization_action, False)

        states_ndarray = workspace.FetchBlob(state_matrix)
        states_ndarray = state_preprocessor.forward(states_ndarray)

        actions_ndarray = torch.from_numpy(workspace.FetchBlob(action_matrix))
        if normalize_actions:
            actions_ndarray = action_preprocessor.forward(actions_ndarray)

        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_states_ndarray = state_preprocessor.forward(next_states_ndarray)

        state_pnas_tile = next_states_ndarray.repeat(1, max_action_size).reshape(
            -1, next_states_ndarray.shape[1]
        )

        next_actions_ndarray = torch.from_numpy(workspace.FetchBlob(next_action_matrix))
        if normalize_actions:
            next_actions_ndarray = action_preprocessor.forward(next_actions_ndarray)

        logged_possible_next_actions = action_preprocessor.forward(
            workspace.FetchBlob(possible_next_actions_matrix)
        )

        assert state_pnas_tile.shape[0] == logged_possible_next_actions.shape[0], (
            "Invalid shapes: "
            + str(state_pnas_tile.shape)
            + " != "
            + str(logged_possible_next_actions.shape)
        )
        logged_possible_next_state_actions = torch.cat(
            (state_pnas_tile, logged_possible_next_actions), dim=1
        )

        logger.info("Reward Timeline to Torch...")
        time_diffs = torch.ones([len(samples.states), 1])

        tdps = []
        pnas_start = 0
        logger.info("Batching...")
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + (minibatch_size * max_action_size)
            tdp = TrainingDataPage(
                states=states_ndarray[start:end],
                actions=actions_ndarray[start:end],
                propensities=action_probabilities[start:end],
                rewards=rewards[start:end],
                next_states=next_states_ndarray[start:end],
                next_actions=next_actions_ndarray[start:end],
                not_terminal=(pnas_mask[start:end, :].sum(dim=1, keepdim=True) > 0),
                time_diffs=time_diffs[start:end],
                possible_next_actions_mask=pnas_mask[start:end, :],
                possible_next_actions_state_concat=logged_possible_next_state_actions[
                    pnas_start:pnas_end, :
                ],
            )
            pnas_start = pnas_end
            tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor)
            tdps.append(tdp)
        return tdps
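The fixed-size padding above keeps a rigid row layout: every transition contributes exactly max_action_size rows of (tiled next state, candidate action), and filler rows are zeroed out by the mask. A minimal sketch of that alignment, with made-up shapes:

import torch

max_action_size = 4
batch_size, state_dim, action_dim = 2, 3, 2

next_states = torch.arange(batch_size * state_dim, dtype=torch.float32).reshape(
    batch_size, state_dim
)
# Repeat each next state max_action_size times, keeping its copies contiguous.
state_tile = next_states.repeat(1, max_action_size).reshape(-1, state_dim)
assert state_tile.shape == (batch_size * max_action_size, state_dim)

# Padded candidate actions: rows beyond the real candidates are zero filler.
pnas = torch.zeros(batch_size * max_action_size, action_dim)
pnas_mask = torch.tensor([[1, 1, 0, 0], [1, 0, 0, 0]], dtype=torch.float32)

state_action_concat = torch.cat((state_tile, pnas), dim=1)
# Row i * max_action_size + j pairs next state i with its j-th candidate;
# pnas_mask[i, j] tells whether that candidate is real or filler.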
Ejemplo n.º 21
0
def preprocess_batch_for_training(
    state_preprocessor, batch, action_names=None, action_preprocessor=None
):

    assert (action_names is None) ^ (
        action_preprocessor is None
    ), "Either action_names should be None xor action_preprocessor should be None"

    # Preprocess state features
    sorted_state_features, _ = state_preprocessor._sort_features_by_normalization()
    sorted_state_features_str = [str(x) for x in sorted_state_features]
    state_features_dense = pandas_sparse_to_dense(
        sorted_state_features_str, batch["state_features"]
    )
    next_state_features_dense = pandas_sparse_to_dense(
        sorted_state_features_str, batch["next_state_features"]
    )

    state_features_dense = state_preprocessor.forward(state_features_dense)
    next_state_features_dense = state_preprocessor.forward(next_state_features_dense)

    mdp_ids = np.array(batch["mdp_id"]).reshape(-1, 1)
    sequence_numbers = torch.tensor(
        batch["sequence_number"], dtype=torch.int32
    ).reshape(-1, 1)
    rewards = torch.tensor(batch["reward"], dtype=torch.float32).reshape(-1, 1)
    time_diffs = torch.tensor(batch["time_diff"], dtype=torch.int32).reshape(-1, 1)

    if action_preprocessor:
        # Preprocess action features for parametric action DQN
        sorted_action_features, _ = (
            action_preprocessor._sort_features_by_normalization()
        )
        sorted_action_features_str = [str(x) for x in sorted_action_features]
        actions = pandas_sparse_to_dense(sorted_action_features_str, batch["action"])

        if "possible_next_actions" not in batch.keys():
            # DDPG / SAC
            not_terminal = torch.from_numpy(
                np.array(batch["next_action"], dtype=np.bool).astype(np.float32)
            ).reshape(-1, 1)
            pnas, pnas_mask, possible_next_state_actions = None, None, None
            pas, pas_mask, possible_state_actions = None, None, None
            next_actions = None
        else:
            # Parametric DQN
            actions = action_preprocessor.forward(actions)
            next_actions = pandas_sparse_to_dense(
                sorted_action_features_str, batch["next_action"]
            )
            next_actions = action_preprocessor.forward(next_actions)

            max_action_size = max(len(pna) for pna in batch["possible_next_actions"])

            pnas_mask = torch.Tensor(
                [
                    ([1] * len(l) + [0] * (max_action_size - len(l)))
                    for l in batch["possible_next_actions"]
                ]
            )
            flat_pnas = []
            for pna in batch["possible_next_actions"]:
                flat_pnas.extend(pna)
                for _ in range(max_action_size - len(pna)):
                    flat_pnas.append({})

            not_terminal = torch.from_numpy(
                np.array(
                    [len(pna) > 0 for pna in batch["possible_next_actions"]]
                ).astype(np.float32)
            ).reshape(-1, 1)
            pnas = pandas_sparse_to_dense(sorted_action_features_str, flat_pnas)
            pnas = action_preprocessor.forward(pnas)
            tiled_next_state_features_dense = next_state_features_dense.repeat(
                1, max_action_size
            ).reshape(-1, next_state_features_dense.shape[1])

            possible_next_state_actions = torch.cat(
                (tiled_next_state_features_dense, pnas.cpu()), dim=1
            )

            pas_mask = torch.Tensor(
                [
                    ([1] * len(l) + [0] * (max_action_size - len(l)))
                    for l in batch["possible_actions"]
                ]
            )
            flat_pas = []
            for pa in batch["possible_actions"]:
                flat_pas.extend(pa)
                for _ in range(max_action_size - len(pa)):
                    flat_pas.append({})
            pas = pandas_sparse_to_dense(sorted_action_features_str, flat_pas)
            pas = action_preprocessor.forward(pas)

            tiled_state_features_dense = state_features_dense.repeat(
                1, max_action_size
            ).reshape(-1, state_features_dense.shape[1])

            possible_state_actions = torch.cat(
                (tiled_state_features_dense, pas.cpu()), dim=1
            )
    else:
        actions = read_actions(action_names, batch["action"])
        next_actions = read_actions(action_names, batch["next_action"])

        pas_mask = torch.from_numpy(
            np.array(batch["possible_next_actions"], dtype=np.float32)
        )

        pnas_mask = np.array(batch["possible_next_actions"], dtype=np.float32)
        not_terminal = np.max(pnas_mask, 1).astype(np.float32).reshape(-1, 1)
        pnas_mask = torch.from_numpy(pnas_mask)
        not_terminal = torch.from_numpy(not_terminal)
        pnas, possible_next_state_actions = None, None
        pas, possible_state_actions = None, None

    if "action_probability" in batch:
        propensities = torch.tensor(
            batch["action_probability"], dtype=torch.float32
        ).reshape(-1, 1)
    else:
        propensities = torch.ones(rewards.shape, dtype=torch.float32)

    return TrainingDataPage(
        mdp_ids=mdp_ids,
        sequence_numbers=sequence_numbers,
        states=state_features_dense,
        actions=actions,
        propensities=propensities,
        rewards=rewards,
        possible_actions_mask=pas_mask,
        next_states=next_state_features_dense,
        next_actions=next_actions,
        possible_next_actions_mask=pnas_mask,
        not_terminal=not_terminal,
        time_diffs=time_diffs,
        possible_actions_state_concat=possible_state_actions,
        possible_next_actions_state_concat=possible_next_state_actions,
    )
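The pandas_sparse_to_dense helper used above is not shown in this snippet; conceptually it turns a list of sparse feature dicts into a dense matrix whose columns follow the sorted feature ids. A rough stand-in under that assumption (the name, defaults and missing-value handling here are illustrative, not the library's code):

import numpy as np

def sparse_dicts_to_dense(sorted_feature_ids, rows, missing_value=0.0):
    # rows: list of {feature_id(str): value} dicts, one output row per dict.
    out = np.full(
        (len(rows), len(sorted_feature_ids)), missing_value, dtype=np.float32
    )
    col = {fid: j for j, fid in enumerate(sorted_feature_ids)}
    for i, row in enumerate(rows):
        for fid, value in row.items():
            if fid in col:
                out[i, col[fid]] = value
    return out

dense = sparse_dicts_to_dense(["11", "12"], [{"11": 0.5}, {"12": 1.0, "11": 2.0}])
# dense -> [[0.5, 0.0], [2.0, 1.0]]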
Ejemplo n.º 22
0
    def preprocess_samples(
        self,
        states: List[Dict[int, float]],
        actions: List[Dict[int, float]],
        rewards: List[float],
        next_states: List[Dict[int, float]],
        next_actions: List[Dict[int, float]],
        is_terminals: List[bool],
        possible_next_actions: List[List[Dict[int, float]]],
        reward_timelines: List[Dict[int, float]],
        minibatch_size: int,
    ) -> List[TrainingDataPage]:
        # Shuffle
        merged = list(
            zip(states, actions, rewards, next_states, next_actions,
                is_terminals, possible_next_actions, reward_timelines))
        random.shuffle(merged)
        states, actions, rewards, next_states, next_actions, is_terminals, \
            possible_next_actions, reward_timelines = zip(*merged)

        net = core.Net('gridworld_preprocessing')
        C2.set_net(net)
        preprocessor = PreprocessorNet(net, True)
        saa = StackedAssociativeArray.from_dict_list(states, 'states')
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            'state_norm',
        )
        saa = StackedAssociativeArray.from_dict_list(next_states,
                                                     'next_states')
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            'next_state_norm',
        )
        saa = StackedAssociativeArray.from_dict_list(actions, 'action')
        action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            'action_norm',
        )
        saa = StackedAssociativeArray.from_dict_list(next_actions,
                                                     'next_action')
        next_action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            'next_action_norm',
        )
        rewards = np.array(rewards, dtype=np.float32).reshape(-1, 1)

        pnas_lengths_list = []
        pnas_flat = []
        for pnas in possible_next_actions:
            pnas_lengths_list.append(len(pnas))
            pnas_flat.extend(pnas)
        saa = StackedAssociativeArray.from_dict_list(pnas_flat,
                                                     'possible_next_actions')
        pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32)
        possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            'possible_next_action_norm',
        )
        workspace.RunNetOnce(net)

        states_ndarray = workspace.FetchBlob(state_matrix)
        actions_ndarray = workspace.FetchBlob(action_matrix)
        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_actions_ndarray = workspace.FetchBlob(next_action_matrix)
        possible_next_actions_ndarray = workspace.FetchBlob(
            possible_next_actions_matrix)
        tdps = []
        pnas_start = 0
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + np.sum(pnas_lengths[start:end])
            pnas = possible_next_actions_ndarray[pnas_start:pnas_end]
            pnas_start = pnas_end
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_ndarray[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    next_actions=next_actions_ndarray[start:end],
                    possible_next_actions=StackedArray(pnas_lengths[start:end],
                                                       pnas),
                    not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1),
                    reward_timelines=reward_timelines[start:end]
                    if reward_timelines else None,
                ))
        return tdps
Ejemplo n.º 23
0
    def preprocess_samples(
        self,
        states: List[Dict[str, float]],
        actions: List[Dict[str, float]],
        rewards: List[float],
        next_states: List[Dict[str, float]],
        next_actions: List[Dict[str, float]],
        is_terminals: List[bool],
        possible_next_actions: List[List[Dict[str, float]]],
        reward_timelines: List[Dict[int, float]],
    ) -> TrainingDataPage:
        # Shuffle
        merged = list(
            zip(states, actions, rewards, next_states, next_actions,
                is_terminals, possible_next_actions, reward_timelines))
        random.shuffle(merged)
        states, actions, rewards, next_states, next_actions, is_terminals, \
            possible_next_actions, reward_timelines = zip(*merged)

        x = []
        for state in states:
            a = [0.0] * self.num_states
            a[int(list(state.keys())[0])] = float(list(state.values())[0])
            x.append(a)
        states = np.array(x, dtype=np.float32)
        x = []
        for state in next_states:
            a = [0.0] * self.num_states
            a[int(list(state.keys())[0])] = float(list(state.values())[0])
            x.append(a)
        next_states = np.array(x, dtype=np.float32)
        x = []
        for action in actions:
            a = [0.0] * self.num_actions
            if len(action) > 0:
                a[int(list(action.keys())[0]) - self.num_states] = \
                    float(list(action.values())[0])
            x.append(a)
        actions = np.array(x, dtype=np.float32)
        x = []
        for action in next_actions:
            a = [0.0] * self.num_actions
            if len(action) > 0:
                a[int(list(action.keys())[0]) - self.num_states] = \
                    float(list(action.values())[0])
            x.append(a)
        next_actions = np.array(x, dtype=np.float32)
        rewards = np.array(rewards, dtype=np.float32)

        continuous_possible_next_actions = []
        for pnas in possible_next_actions:
            pna = []
            for action in pnas:
                a = [0.0] * self.num_actions
                if len(action) > 0:
                    a[int(list(action.keys())[0]) - self.num_states] = \
                        float(list(action.values())[0])
                pna.append(a)
            continuous_possible_next_actions.append(
                np.array(pna, dtype=np.float32))
        continuous_possible_next_actions = np.array(
            continuous_possible_next_actions, dtype=object)

        return TrainingDataPage(
            states=states,
            actions=actions,
            rewards=rewards,
            next_states=next_states,
            next_actions=next_actions,
            possible_next_actions=continuous_possible_next_actions,
            reward_timelines=reward_timelines,
            ds=[datetime.date.today().strftime('%Y-%m-%d')] * len(rewards))
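This gridworld variant assumes each sparse dict carries a single feature whose key is the index itself, with action indices offset by num_states. A tiny sketch of that encoding with made-up sizes:

num_states, num_actions = 5, 3

def dense_state(sparse):
    v = [0.0] * num_states
    if sparse:
        k, val = next(iter(sparse.items()))
        v[int(k)] = float(val)
    return v

def dense_action(sparse):
    v = [0.0] * num_actions
    if sparse:
        k, val = next(iter(sparse.items()))
        v[int(k) - num_states] = float(val)  # action ids start after state ids
    return v

dense_state({2: 1.0})   # -> [0.0, 0.0, 1.0, 0.0, 0.0]
dense_action({6: 1.0})  # -> [0.0, 1.0, 0.0]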
Ejemplo n.º 24
0
    def sample_memories(self, batch_size, model_type, chunk=None):
        """
        Samples transitions from replay memory, uniformly at random by default,
        or deterministically when a chunk index is passed.

        :param batch_size: Number of sampled transitions to return.
        :param model_type: Model type (discrete, parametric).
        :param chunk: Index of chunk of data (for deterministic sampling).
        """
        cols = [[], [], [], [], [], [], [], [], [], [], [], []]

        if chunk is None:
            indices = np.random.permutation(len(
                self.replay_memory))[:batch_size]
        else:
            start_idx = chunk * batch_size
            end_idx = start_idx + batch_size
            indices = range(start_idx, end_idx)

        for idx in indices:
            memory = self.replay_memory[idx]
            for col, value in zip(cols, memory):
                col.append(value)

        states = torch.tensor(cols[0], dtype=torch.float32)
        next_states = torch.tensor(cols[3], dtype=torch.float32)

        if model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
            num_possible_actions = len(cols[7][0])

            actions = torch.tensor(cols[1], dtype=torch.float32)
            possible_actions = []
            for pa_matrix in cols[8]:
                logger.info("PA" + str(pa_matrix))
                for row in pa_matrix:
                    possible_actions.append(row)

            tiled_states = states.repeat(1, num_possible_actions).reshape(
                -1, states.shape[1])
            possible_actions = torch.tensor(possible_actions,
                                            dtype=torch.float32)
            possible_actions_state_concat = torch.cat(
                (tiled_states, possible_actions), dim=1)
            possible_actions_mask = torch.tensor(cols[9], dtype=torch.float32)

            next_actions = torch.tensor(cols[4], dtype=torch.float32)
            possible_next_actions = []
            for pna_matrix in cols[6]:
                for row in pna_matrix:
                    logger.info("PNA" + str(row))
                    possible_next_actions.append(row)

            tiled_next_states = next_states.repeat(
                1, num_possible_actions).reshape(-1, next_states.shape[1])

            possible_next_actions = torch.tensor(possible_next_actions,
                                                 dtype=torch.float32)
            possible_next_actions_state_concat = torch.cat(
                (tiled_next_states, possible_next_actions), dim=1)
            possible_next_actions_mask = torch.tensor(cols[7],
                                                      dtype=torch.float32)
        else:
            possible_actions = None
            possible_actions_state_concat = None
            possible_next_actions = None
            possible_next_actions_state_concat = None
            if cols[7] is None or cols[7][0] is None:
                possible_next_actions_mask = None
            else:
                possible_next_actions_mask = torch.tensor(cols[7],
                                                          dtype=torch.float32)
            if cols[9] is None or cols[9][0] is None:
                possible_actions_mask = None
            else:
                possible_actions_mask = torch.tensor(cols[9],
                                                     dtype=torch.float32)

            actions = torch.tensor(cols[1], dtype=torch.float32)
            next_actions = torch.tensor(cols[4], dtype=torch.float32)

        return TrainingDataPage(
            states=states,
            actions=actions,
            propensities=None,
            rewards=torch.tensor(cols[2], dtype=torch.float32).reshape(-1, 1),
            next_states=next_states,
            next_actions=next_actions,
            not_terminal=torch.from_numpy(
                np.logical_not(np.array(cols[5])).astype(np.int32)).reshape(-1, 1),
            time_diffs=torch.tensor(cols[10],
                                    dtype=torch.int32).reshape(-1, 1),
            possible_actions_mask=possible_actions_mask,
            possible_actions_state_concat=possible_actions_state_concat,
            possible_next_actions_mask=possible_next_actions_mask,
            possible_next_actions_state_concat=
            possible_next_actions_state_concat,
        )
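The two sampling modes in the docstring differ only in how indices are chosen; here is a minimal, self-contained sketch over a stand-in replay memory (sizes are arbitrary):

import numpy as np

replay_memory = list(range(100))  # stand-in for stored transitions
batch_size = 8

# Default: uniform random sample without replacement.
random_indices = np.random.permutation(len(replay_memory))[:batch_size]

# Deterministic: the chunk-th contiguous block of batch_size transitions.
chunk = 3
deterministic_indices = range(chunk * batch_size, (chunk + 1) * batch_size)

random_batch = [replay_memory[i] for i in random_indices]
deterministic_batch = [replay_memory[i] for i in deterministic_indices]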
Ejemplo n.º 25
0
    def preprocess(self, batch) -> TrainingDataPage:
        tdp = super().preprocess(batch)

        # Preprocess action features for parametric action DQN
        sorted_action_features, _ = (
            self.action_preprocessor._sort_features_by_normalization())
        sorted_action_features_str = [str(x) for x in sorted_action_features]
        actions = self.sparse_to_dense_processor(sorted_action_features_str,
                                                 batch["action"])

        actions = self.action_preprocessor.forward(actions)
        next_actions = self.sparse_to_dense_processor(
            sorted_action_features_str, batch["next_action"])
        next_actions = self.action_preprocessor.forward(next_actions)

        max_action_size = max(
            len(pna) for pna in batch["possible_next_actions"])

        pnas_mask = torch.Tensor([
            ([1] * len(l) + [0] * (max_action_size - len(l)))
            for l in batch["possible_next_actions"]
        ])
        flat_pnas: List[Dict[int, float]] = []
        for pa in batch["possible_next_actions"]:
            flat_pnas.extend(pa)
            for _ in range(max_action_size - len(pa)):
                flat_pnas.append({})

        not_terminal = torch.from_numpy(
            np.array([len(pna) > 0 for pna in batch["possible_next_actions"]
                      ]).astype(np.float32)).reshape(-1, 1)
        pnas = self.sparse_to_dense_processor(sorted_action_features_str,
                                              flat_pnas)
        pnas = self.action_preprocessor.forward(pnas)
        tiled_next_state_features_dense = tdp.next_states.repeat(
            1, max_action_size).reshape(-1, tdp.next_states.shape[1])

        possible_next_state_actions = torch.cat(
            (tiled_next_state_features_dense, pnas.cpu()), dim=1)

        pas_mask = torch.Tensor([
            ([1] * len(l) + [0] * (max_action_size - len(l)))
            for l in batch["possible_actions"]
        ])
        flat_pas: List[Dict[int, float]] = []
        for pa in batch["possible_actions"]:
            flat_pas.extend(pa)
            for _ in range(max_action_size - len(pa)):
                flat_pas.append({})
        pas = self.sparse_to_dense_processor(sorted_action_features_str,
                                             flat_pas)
        pas = self.action_preprocessor.forward(pas)

        tiled_state_features_dense = tdp.states.repeat(
            1, max_action_size).reshape(-1, tdp.states.shape[1])

        possible_state_actions = torch.cat(
            (tiled_state_features_dense, pas.cpu()), dim=1)

        return TrainingDataPage(
            mdp_ids=tdp.mdp_ids,
            sequence_numbers=tdp.sequence_numbers,
            states=tdp.states,
            actions=actions,
            propensities=tdp.propensities,
            rewards=tdp.rewards,
            possible_actions_mask=pas_mask,
            next_states=tdp.next_states,
            next_actions=next_actions,
            possible_next_actions_mask=pnas_mask,
            not_terminal=not_terminal,
            time_diffs=tdp.time_diffs,
            possible_actions_state_concat=possible_state_actions,
            possible_next_actions_state_concat=possible_next_state_actions,
            max_num_actions=max_action_size,
        )
Ejemplo n.º 26
0
def preprocess_batch_for_training(state_preprocessor,
                                  batch,
                                  action_names=None,
                                  action_preprocessor=None):

    assert (action_names is None) ^ (
        action_preprocessor is None
    ), "Either action_names should be None xor action_preprocessor should be None"

    # Preprocess state features
    sorted_state_features, _ = state_preprocessor._sort_features_by_normalization(
    )
    sorted_state_features_str = [str(x) for x in sorted_state_features]
    state_features_dense = pandas_sparse_to_dense(sorted_state_features_str,
                                                  batch["state_features"])
    next_state_features_dense = pandas_sparse_to_dense(
        sorted_state_features_str, batch["next_state_features"])
    state_features_dense = state_preprocessor.forward(state_features_dense)
    next_state_features_dense = state_preprocessor.forward(
        next_state_features_dense)

    mdp_ids = np.array(batch["mdp_id"])
    sequence_numbers = np.array(batch["sequence_number"], dtype=np.int32)
    rewards = np.array(batch["reward"], dtype=np.float32)
    time_diffs = np.array(batch["time_diff"], dtype=np.int32)
    episode_values = np.array(batch["episode_value"], dtype=np.float32)

    if action_preprocessor:
        # Preprocess action features for parametric action DQN
        sorted_action_features, _ = (
            action_preprocessor._sort_features_by_normalization())
        sorted_action_features_str = [str(x) for x in sorted_action_features]
        actions = pandas_sparse_to_dense(sorted_action_features_str,
                                         batch["action"])
        actions = action_preprocessor.forward(actions)

        if "possible_next_actions" not in batch.keys():
            # DDPG
            not_terminals = np.array(batch["next_action"], dtype=bool) * 1
            pnas, pnas_lens, possible_next_state_actions = None, None, None
            pas, pas_lens, possible_state_actions = None, None, None
        else:
            # Parametric DQN
            pnas_lens = np.array(
                [len(l) for l in batch["possible_next_actions"]])
            flat_pnas = list(
                itertools.chain.from_iterable(batch["possible_next_actions"]))
            not_terminals = pnas_lens.astype(bool)
            pnas = pandas_sparse_to_dense(sorted_action_features_str,
                                          flat_pnas)
            pnas = action_preprocessor.forward(pnas)
            tiled_next_state_features_dense = np.repeat(
                next_state_features_dense, pnas_lens, axis=0)
            possible_next_state_actions = torch.cat(
                (tiled_next_state_features_dense, pnas), dim=1)
            pas_lens = np.array([len(l) for l in batch["possible_actions"]])
            flat_pas = list(
                itertools.chain.from_iterable(batch["possible_actions"]))
            pas = pandas_sparse_to_dense(sorted_action_features_str, flat_pas)
            pas = action_preprocessor.forward(pas)
            tiled_state_features_dense = np.repeat(state_features_dense,
                                                   pas_lens,
                                                   axis=0)
            possible_state_actions = torch.cat(
                (tiled_state_features_dense, pas), dim=1)
    else:
        actions = read_actions(action_names, batch["action"])
        pnas = np.array(batch["possible_next_actions"], dtype=np.float32)
        not_terminals = np.max(pnas, 1).astype(np.bool)
        pnas_lens, possible_next_state_actions = None, None
        pas, pas_lens, possible_state_actions = None, None, None
    if "propensity" in batch:
        propensities = np.array(batch["propensity"], dtype=np.float32)
    else:
        propensities = np.ones(shape=rewards.shape, dtype=np.float32)

    return TrainingDataPage(
        mdp_ids=mdp_ids,
        sequence_numbers=sequence_numbers,
        states=state_features_dense,
        actions=actions,
        propensities=propensities,
        rewards=rewards,
        possible_actions=pas,
        possible_actions_lengths=pas_lens,
        next_states=next_state_features_dense,
        possible_next_actions=pnas,
        possible_next_actions_lengths=pnas_lens,
        episode_values=episode_values,
        not_terminals=not_terminals,
        time_diffs=time_diffs,
        state_pas_concat=possible_state_actions,
        next_state_pnas_concat=possible_next_state_actions,
    )
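Unlike the fixed-size padding in other snippets, this version tiles each state by the number of its candidate actions, so the concatenated matrix ends up with sum(lengths) rows. A small sketch of that alignment on toy data:

import numpy as np

states = np.array([[0.0, 0.0], [1.0, 1.0]], dtype=np.float32)
possible_next_actions = [
    [{"101": 1.0}, {"102": 1.0}, {"103": 1.0}],  # 3 candidates for state 0
    [{"101": 1.0}],                              # 1 candidate for state 1
]
lengths = np.array([len(p) for p in possible_next_actions])

tiled_states = np.repeat(states, lengths, axis=0)
# tiled_states has lengths.sum() == 4 rows:
# rows 0-2 are copies of states[0], row 3 is states[1]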
Ejemplo n.º 27
0
    def preprocess_samples_discrete(
        self,
        states: List[Dict[str, float]],
        actions: List[str],
        rewards: List[float],
        next_states: List[Dict[str, float]],
        next_actions: List[str],
        is_terminals: List[bool],
        possible_next_actions: List[List[str]],
        reward_timelines: Optional[List[Dict[int, float]]],
    ) -> TrainingDataPage:
        # Shuffle
        if reward_timelines is None:
            merged = list(
                zip(states, actions, rewards, next_states, next_actions,
                    is_terminals, possible_next_actions))
            random.shuffle(merged)
            states, actions, rewards, next_states, next_actions, \
                is_terminals, possible_next_actions = zip(*merged)
        else:
            merged = list(
                zip(states, actions, rewards, next_states, next_actions,
                    is_terminals, possible_next_actions, reward_timelines))
            random.shuffle(merged)
            states, actions, rewards, next_states, next_actions, \
                is_terminals, possible_next_actions, reward_timelines = zip(*merged)

        x = []
        for state in states:
            a = [0.0] * self.num_states
            a[int(list(state.keys())[0])] = float(list(state.values())[0])
            x.append(a)
        states = np.array(x, dtype=np.float32)
        x = []
        for state in next_states:
            a = [0.0] * self.num_states
            a[int(list(state.keys())[0])] = float(list(state.values())[0])
            x.append(a)
        next_states = np.array(x, dtype=np.float32)
        actions_one_hot = np.zeros(
            [len(actions), len(self.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(actions):
            actions_one_hot[i, self.ACTIONS.index(action)] = 1
        rewards = np.array(rewards, dtype=np.float32)
        next_actions_one_hot = np.zeros(
            [len(next_actions), len(self.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(next_actions):
            if action == '':
                continue
            next_actions_one_hot[i, self.ACTIONS.index(action)] = 1
        possible_next_actions_mask = []
        for pna in possible_next_actions:
            pna_mask = [0] * self.num_actions
            for action in pna:
                pna_mask[self.ACTIONS.index(action)] = 1
            possible_next_actions_mask.append(pna_mask)
        possible_next_actions_mask = np.array(possible_next_actions_mask,
                                              dtype=np.float32)
        is_terminals = np.array(is_terminals, dtype=bool)
        if reward_timelines is not None:
            reward_timelines = np.array(reward_timelines, dtype=object)

        return TrainingDataPage(
            states=states,
            actions=actions_one_hot,
            rewards=rewards,
            next_states=next_states,
            next_actions=next_actions_one_hot,
            possible_next_actions=possible_next_actions_mask,
            reward_timelines=reward_timelines,
        )
Ejemplo n.º 28
0
    def sample_memories(self, batch_size, model_type, chunk=None):
        """
        Samples transitions from replay memory, uniformly at random by default,
        or deterministically when a chunk index is passed.

        *Note*: 1-D vectors such as state & action get stacked to make a 2-D
        matrix, while a 2-D matrix such as possible_actions (in the parametric
        case) gets concatenated to make a bigger 2-D matrix.

        :param batch_size: Number of sampled transitions to return.
        :param model_type: Model type (discrete, parametric).
        :param chunk: Index of chunk of data (for deterministic sampling).
        """
        if chunk is None:
            indices = torch.randint(0, self.size, size=(batch_size, ))
        else:
            start_idx = chunk * batch_size
            end_idx = start_idx + batch_size
            indices = range(start_idx, end_idx)

        memory = self.memory_buffer.slice(indices)

        states = memory.state
        next_states = memory.next_state

        assert states.dim() == 2
        assert next_states.dim() == 2

        if model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
            num_possible_actions = memory.possible_actions_mask.shape[1]

            actions = memory.action
            next_actions = memory.next_action

            tiled_states = states.repeat(1, num_possible_actions).reshape(
                -1, states.shape[1])
            possible_actions = memory.possible_actions.reshape(
                -1, actions.shape[1])
            possible_actions_state_concat = torch.cat(
                (tiled_states, possible_actions), dim=1)
            possible_actions_mask = memory.possible_actions_mask

            tiled_next_states = next_states.repeat(
                1, num_possible_actions).reshape(-1, next_states.shape[1])
            possible_next_actions = memory.possible_next_actions.reshape(
                -1, actions.shape[1])
            possible_next_actions_state_concat = torch.cat(
                (tiled_next_states, possible_next_actions), dim=1)
            possible_next_actions_mask = memory.possible_next_actions_mask
        else:
            possible_actions = None
            possible_actions_state_concat = None
            possible_next_actions = None
            possible_next_actions_state_concat = None
            possible_next_actions_mask = memory.possible_next_actions_mask
            possible_actions_mask = memory.possible_actions_mask

            actions = memory.action
            next_actions = memory.next_action

            assert len(actions.size()) == 2
            assert len(next_actions.size()) == 2

        rewards = memory.reward
        not_terminal = 1 - memory.terminal
        time_diffs = memory.time_diff

        return TrainingDataPage(
            states=states,
            actions=actions,
            propensities=None,
            rewards=rewards,
            next_states=next_states,
            next_actions=next_actions,
            not_terminal=not_terminal,
            time_diffs=time_diffs,
            possible_actions_mask=possible_actions_mask,
            possible_actions_state_concat=possible_actions_state_concat,
            possible_next_actions_mask=possible_next_actions_mask,
            possible_next_actions_state_concat=
            possible_next_actions_state_concat,
        )
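The stacking-versus-concatenation note in the docstring can be made concrete with toy tensors (shapes below are made up):

import torch

# Per-transition 1-D vectors (state, action) stack into a (batch, dim) matrix.
states = [torch.tensor([0.0, 0.0]), torch.tensor([1.0, 1.0])]
stacked_states = torch.stack(states)            # shape (2, 2)

# Per-transition 2-D possible-action matrices concatenate along dim 0.
possible_actions = [torch.ones(3, 2), torch.ones(3, 2)]
concat_pa = torch.cat(possible_actions, dim=0)  # shape (6, 2)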
Ejemplo n.º 29
0
    def train(self,
              training_samples: TrainingDataPage,
              evaluator=None,
              episode_values=None) -> None:
        training_samples.set_type(self.dtype)

        if self.minibatch == 0:
            # Assume that the tensors are the right shape after the first minibatch
            assert training_samples.states.shape[0] == self.minibatch_size, (
                "Invalid shape: " + str(training_samples.states.shape))
            assert training_samples.actions.shape[0] == self.minibatch_size, (
                "Invalid shape: " + str(training_samples.actions.shape))
            assert training_samples.rewards.shape == torch.Size(
                [self.minibatch_size,
                 1]), "Invalid shape: " + str(training_samples.rewards.shape)
            assert (training_samples.episode_values is None
                    or training_samples.episode_values.shape
                    == training_samples.rewards.shape), (
                        "Invalid shape: " +
                        str(training_samples.episode_values.shape))
            assert (training_samples.next_states.shape == training_samples.
                    states.shape), ("Invalid shape: " +
                                    str(training_samples.next_states.shape))
            assert (training_samples.not_terminals.shape == training_samples.
                    rewards.shape), ("Invalid shape: " +
                                     str(training_samples.not_terminals.shape))
            assert training_samples.possible_next_actions_state_concat.shape[
                1] == (
                    training_samples.states.shape[1] +
                    training_samples.actions.shape[1]
                ), ("Invalid shape: " + str(
                    training_samples.possible_next_actions_state_concat.shape))
            assert training_samples.possible_next_actions_lengths.shape == torch.Size(
                [
                    self.minibatch_size
                ]), ("Invalid shape: " +
                     str(training_samples.possible_next_actions_lengths.shape))

        self.minibatch += 1

        states = training_samples.states.detach().requires_grad_(True)
        actions = training_samples.actions
        state_action_pairs = torch.cat((states, actions), dim=1)

        rewards = training_samples.rewards
        discount_tensor = torch.full(training_samples.time_diffs.shape,
                                     self.gamma).type(self.dtype)
        not_done_mask = training_samples.not_terminals

        if self.use_seq_num_diff_as_time_diff:
            discount_tensor = discount_tensor.pow(training_samples.time_diffs)

        if self.maxq_learning:
            # Compute max a' Q(s', a') over all possible actions using target network
            next_q_values = self.get_max_q_values(
                training_samples.possible_next_actions_state_concat,
                training_samples.possible_next_actions_lengths,
                self.double_q_learning,
            )
        else:
            # SARSA
            next_state_action_pairs = torch.cat(
                (training_samples.next_states, training_samples.next_actions),
                dim=1)
            next_q_values = self.get_next_action_q_values(
                next_state_action_pairs)

        filtered_max_q_vals = next_q_values.reshape(-1, 1) * not_done_mask

        if self.use_reward_burnin and self.minibatch < self.reward_burnin:
            target_q_values = rewards
        else:
            target_q_values = rewards + (discount_tensor * filtered_max_q_vals)

        # Get Q-value of action taken
        q_values = self.q_network(state_action_pairs)
        self.all_action_scores = q_values.detach()

        value_loss = self.q_network_loss(q_values, target_q_values)
        self.loss = value_loss.detach()

        self.q_network_optimizer.zero_grad()
        value_loss.backward()
        if self.gradient_handler:
            self.gradient_handler(self.q_network.parameters())
        self.q_network_optimizer.step()

        if self.use_reward_burnin and self.minibatch < self.reward_burnin:
            # Reward burn-in: hard-copy the online network into the target network
            self._soft_update(self.q_network, self.q_network_target, 1.0)
        else:
            # Use the soft update rule to update target network
            self._soft_update(self.q_network, self.q_network_target, self.tau)

        # get reward estimates
        reward_estimates = self.reward_network(state_action_pairs)
        reward_loss = F.mse_loss(reward_estimates, rewards)
        self.reward_network_optimizer.zero_grad()
        reward_loss.backward()
        self.reward_network_optimizer.step()

        if evaluator is not None:
            self.evaluate(
                evaluator,
                training_samples.actions,
                training_samples.propensities,
                training_samples.episode_values,
            )
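Both branches above feed the same one-step TD target, differing only in which next-state Q-value is plugged in (the max over candidates versus the logged next action). A minimal sketch of the target computation, assuming the Q-values are already available:

import torch

rewards = torch.tensor([[1.0], [0.0]])
not_done_mask = torch.tensor([[1.0], [0.0]])  # 0 where the episode ended
gamma = 0.9
next_q_values = torch.tensor([[2.0], [5.0]])  # max-Q for Q-learning, Q(s', a') for SARSA

discount = torch.full_like(rewards, gamma)
target_q_values = rewards + discount * (next_q_values * not_done_mask)
# -> [[1.0 + 0.9 * 2.0], [0.0]] == [[2.8], [0.0]]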
Ejemplo n.º 30
0
    def sample_memories(self, batch_size, model_type, chunk=None):
        """
        Samples transitions from replay memory, uniformly at random by default,
        or deterministically when a chunk index is passed.

        *Note*: 1-D vectors such as state & action get stacked to make a 2-D
        matrix, while a 2-D matrix such as possible_actions (in the parametric
        case) gets concatenated to make a bigger 2-D matrix.

        :param batch_size: Number of sampled transitions to return.
        :param model_type: Model type (discrete, parametric).
        :param chunk: Index of chunk of data (for deterministic sampling).
        """
        cols = [[], [], [], [], [], [], [], [], [], [], [], []]

        if chunk is None:
            indices = np.random.randint(0,
                                        len(self.replay_memory),
                                        size=batch_size)
        else:
            start_idx = chunk * batch_size
            end_idx = start_idx + batch_size
            indices = range(start_idx, end_idx)

        for idx in indices:
            memory = self.replay_memory[idx]
            for col, value in zip(cols, memory):
                col.append(value)

        states = stack(cols[0])
        next_states = stack(cols[3])

        assert states.dim() == 2
        assert next_states.dim() == 2

        if model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
            num_possible_actions = len(cols[7][0])

            actions = stack(cols[1])
            next_actions = stack(cols[4])

            tiled_states = states.repeat(1, num_possible_actions).reshape(
                -1, states.shape[1])
            possible_actions = torch.cat(cols[8])
            possible_actions_state_concat = torch.cat(
                (tiled_states, possible_actions), dim=1)
            possible_actions_mask = stack(cols[9])

            tiled_next_states = next_states.repeat(
                1, num_possible_actions).reshape(-1, next_states.shape[1])
            possible_next_actions = torch.cat(cols[6])
            possible_next_actions_state_concat = torch.cat(
                (tiled_next_states, possible_next_actions), dim=1)
            possible_next_actions_mask = stack(cols[7])
        else:
            possible_actions = None
            possible_actions_state_concat = None
            possible_next_actions = None
            possible_next_actions_state_concat = None
            if cols[7] is None or cols[7][0] is None:
                possible_next_actions_mask = None
            else:
                possible_next_actions_mask = stack(cols[7])
            if cols[9] is None or cols[9][0] is None:
                possible_actions_mask = None
            else:
                possible_actions_mask = stack(cols[9])

            actions = stack(cols[1])
            next_actions = stack(cols[4])

            assert len(actions.size()) == 2
            assert len(next_actions.size()) == 2

        rewards = torch.tensor(cols[2], dtype=torch.float32).reshape(-1, 1)
        not_terminal = (1 - torch.tensor(cols[5], dtype=torch.int32)).reshape(
            -1, 1)
        time_diffs = torch.tensor(cols[10], dtype=torch.int32).reshape(-1, 1)

        return TrainingDataPage(
            states=states,
            actions=actions,
            propensities=None,
            rewards=rewards,
            next_states=next_states,
            next_actions=next_actions,
            not_terminal=not_terminal,
            time_diffs=time_diffs,
            possible_actions_mask=possible_actions_mask,
            possible_actions_state_concat=possible_actions_state_concat,
            possible_next_actions_mask=possible_next_actions_mask,
            possible_next_actions_state_concat=
            possible_next_actions_state_concat,
        )
Ejemplo n.º 31
0
    def preprocess_samples_discrete(
            self,
            samples: Samples,
            minibatch_size: int,
            one_hot_action: bool = True) -> List[TrainingDataPage]:
        logger.info("Shuffling...")
        samples.shuffle()
        logger.info("Preprocessing...")

        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        preprocessor = PreprocessorNet(True)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "state_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "next_state_norm",
            False,
            False,
            False,
        )
        workspace.RunNetOnce(net)

        logger.info("Converting to Torch...")
        actions_one_hot = torch.tensor((np.array(samples.actions).reshape(
            -1, 1) == np.array(self.ACTIONS)).astype(np.int64))
        actions = actions_one_hot.argmax(dim=1, keepdim=True)
        rewards = torch.tensor(samples.rewards,
                               dtype=torch.float32).reshape(-1, 1)
        action_probabilities = torch.tensor(samples.action_probabilities,
                                            dtype=torch.float32).reshape(
                                                -1, 1)
        next_actions_one_hot = torch.tensor(
            (np.array(samples.next_actions).reshape(-1, 1) == np.array(
                self.ACTIONS)).astype(np.int64))
        logger.info("Converting PNA to Torch...")
        possible_next_action_strings = np.array(
            list(
                itertools.zip_longest(*samples.possible_next_actions,
                                      fillvalue=""))).T
        possible_next_actions_mask = torch.zeros(
            [len(samples.next_actions),
             len(self.ACTIONS)])
        for i, action in enumerate(self.ACTIONS):
            possible_next_actions_mask[:, i] = torch.tensor(
                np.max(possible_next_action_strings == action,
                       axis=1).astype(np.int64))
        terminals = torch.tensor(samples.terminals,
                                 dtype=torch.int32).reshape(-1, 1)
        not_terminals = 1 - terminals
        logger.info("Converting RT to Torch...")
        episode_values = torch.tensor(samples.episode_values,
                                      dtype=torch.float32).reshape(-1, 1)

        time_diffs = torch.ones([len(samples.states), 1])

        logger.info("Preprocessing...")
        preprocessor = Preprocessor(self.normalization, False)

        states_ndarray = workspace.FetchBlob(state_matrix)
        states_ndarray = preprocessor.forward(states_ndarray)

        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_states_ndarray = preprocessor.forward(next_states_ndarray)

        logger.info("Batching...")
        tdps = []
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            tdp = TrainingDataPage(
                states=states_ndarray[start:end],
                actions=actions_one_hot[start:end]
                if one_hot_action else actions[start:end],
                propensities=action_probabilities[start:end],
                rewards=rewards[start:end],
                next_states=next_states_ndarray[start:end],
                not_terminals=not_terminals[start:end],
                next_actions=next_actions_one_hot[start:end],
                possible_next_actions=possible_next_actions_mask[start:end],
                episode_values=episode_values[start:end]
                if episode_values is not None else None,
                time_diffs=time_diffs[start:end],
            )
            tdp.set_type(torch.FloatTensor)
            tdps.append(tdp)
        return tdps
Ejemplo n.º 32
0
    def preprocess_samples_discrete(
        self,
        samples: Samples,
        minibatch_size: int,
        one_hot_action: bool = True,
        use_gpu: bool = False,
    ) -> List[TrainingDataPage]:
        logger.info("Shuffling...")
        samples = shuffle_samples(samples)
        logger.info("Preprocessing...")

        if self.sparse_to_dense_net is None:
            self.sparse_to_dense_net = core.Net("gridworld_sparse_to_dense")
            C2.set_net(self.sparse_to_dense_net)
            saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
            sorted_features, _ = sort_features_by_normalization(self.normalization)
            self.state_matrix, _ = sparse_to_dense(
                saa.lengths, saa.keys, saa.values, sorted_features
            )
            saa = StackedAssociativeArray.from_dict_list(
                samples.next_states, "next_states"
            )
            self.next_state_matrix, _ = sparse_to_dense(
                saa.lengths, saa.keys, saa.values, sorted_features
            )
            C2.set_net(None)
        else:
            StackedAssociativeArray.from_dict_list(samples.states, "states")
            StackedAssociativeArray.from_dict_list(samples.next_states, "next_states")
        workspace.RunNetOnce(self.sparse_to_dense_net)

        logger.info("Converting to Torch...")
        actions_one_hot = torch.tensor(
            (np.array(samples.actions).reshape(-1, 1) == np.array(self.ACTIONS)).astype(
                np.int64
            )
        )
        actions = actions_one_hot.argmax(dim=1, keepdim=True)
        rewards = torch.tensor(samples.rewards, dtype=torch.float32).reshape(-1, 1)
        action_probabilities = torch.tensor(
            samples.action_probabilities, dtype=torch.float32
        ).reshape(-1, 1)
        next_actions_one_hot = torch.tensor(
            (
                np.array(samples.next_actions).reshape(-1, 1) == np.array(self.ACTIONS)
            ).astype(np.int64)
        )
        logger.info("Converting PA to Torch...")
        possible_action_strings = np.array(
            list(itertools.zip_longest(*samples.possible_actions, fillvalue=""))
        ).T
        possible_actions_mask = torch.zeros([len(samples.actions), len(self.ACTIONS)])
        for i, action in enumerate(self.ACTIONS):
            possible_actions_mask[:, i] = torch.tensor(
                np.max(possible_action_strings == action, axis=1).astype(np.int64)
            )
        logger.info("Converting PNA to Torch...")
        possible_next_action_strings = np.array(
            list(itertools.zip_longest(*samples.possible_next_actions, fillvalue=""))
        ).T
        possible_next_actions_mask = torch.zeros(
            [len(samples.next_actions), len(self.ACTIONS)]
        )
        for i, action in enumerate(self.ACTIONS):
            possible_next_actions_mask[:, i] = torch.tensor(
                np.max(possible_next_action_strings == action, axis=1).astype(np.int64)
            )
        terminals = torch.tensor(samples.terminals, dtype=torch.int32).reshape(-1, 1)
        not_terminal = 1 - terminals
        logger.info("Converting RT to Torch...")

        time_diffs = torch.ones([len(samples.states), 1])

        logger.info("Preprocessing...")
        preprocessor = Preprocessor(self.normalization, False)

        states_ndarray = workspace.FetchBlob(self.state_matrix)
        states_ndarray = preprocessor.forward(states_ndarray)

        next_states_ndarray = workspace.FetchBlob(self.next_state_matrix)
        next_states_ndarray = preprocessor.forward(next_states_ndarray)

        logger.info("Batching...")
        tdps = []
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            tdp = TrainingDataPage(
                states=states_ndarray[start:end],
                actions=actions_one_hot[start:end]
                if one_hot_action
                else actions[start:end],
                propensities=action_probabilities[start:end],
                rewards=rewards[start:end],
                next_states=next_states_ndarray[start:end],
                not_terminal=not_terminal[start:end],
                next_actions=next_actions_one_hot[start:end],
                possible_actions_mask=possible_actions_mask[start:end],
                possible_next_actions_mask=possible_next_actions_mask[start:end],
                time_diffs=time_diffs[start:end],
            )
            tdp.set_type(torch.cuda.FloatTensor if use_gpu else torch.FloatTensor)
            tdps.append(tdp)
        return tdps
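The broadcast comparison used above is a compact way to one-hot encode string actions, and zip_longest pads the ragged possible-action lists before the same trick builds the mask. A toy sketch (action names are made up):

import itertools
import numpy as np

ACTIONS = ["L", "R", "U", "D"]
sampled_actions = ["R", "L"]

one_hot = (
    np.array(sampled_actions).reshape(-1, 1) == np.array(ACTIONS)
).astype(np.int64)
# -> [[0, 1, 0, 0],
#     [1, 0, 0, 0]]

# Ragged possible-action lists are padded with "" via zip_longest, then compared
# column-wise against each action name to build the mask.
possible_actions = [["L", "R"], ["U"]]
padded = np.array(list(itertools.zip_longest(*possible_actions, fillvalue=""))).T
mask = np.stack(
    [(padded == a).max(axis=1) for a in ACTIONS], axis=1
).astype(np.int64)
# -> [[1, 1, 0, 0],
#     [0, 0, 1, 0]]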