Example #1
    def __init__(self,
                 state_normalization_parameters: Dict[str,
                                                      NormalizationParameters],
                 parameters: Union[DiscreteActionModelParameters,
                                   ContinuousActionModelParameters],
                 skip_normalization: Optional[bool] = False) -> None:
        print(state_normalization_parameters)
        print(parameters)

        self._state_normalization_parameters = state_normalization_parameters
        MLTrainer.__init__(self, "rl_trainer", parameters.training)

        self.target_network = TargetNetwork(self,
                                            parameters.rl.target_update_rate)

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma

        self.training_iteration = 0
        self._buffers = None
        self.minibatch_size = parameters.training.minibatch_size

        self.skip_normalization = skip_normalization
        self._prepare_state_normalization()
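
Several of the examples here build a TargetNetwork from parameters.rl.target_update_rate. The TargetNetwork implementation itself is not shown on this page; the sketch below is only a minimal NumPy illustration of the soft (exponential-moving-average) update that a target update rate usually denotes, under that assumption, and is not the class used above.

import numpy as np

class SoftTargetNetwork:
    """Hypothetical sketch of a soft target update, not the TargetNetwork above."""

    def __init__(self, weights, target_update_rate):
        self.target_weights = [w.copy() for w in weights]
        self.tau = target_update_rate

    def target_update(self, trained_weights):
        # Move each target weight a fraction tau of the way toward the trained weight.
        for target_w, trained_w in zip(self.target_weights, trained_weights):
            target_w += self.tau * (trained_w - target_w)

weights = [np.ones((2, 2), dtype=np.float32)]
target = SoftTargetNetwork(weights, target_update_rate=0.1)
target.target_update([w + 1.0 for w in weights])
print(target.target_weights[0])  # 1.1 everywhere: 10% of the way from 1.0 to 2.0
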
Example #2
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))

        assert parameters.training.layers[0] >= 0,\
            "Set layers[0] to a the number of features"

        self.num_features = parameters.training.layers[0]

        MLTrainer.__init__(self, RL_TRAINER_MODEL_ID, parameters.training)

        self.target_network = TargetNetwork(
            self, parameters.rl.target_update_rate
        )

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma
        self.rl_temperature = parameters.rl.temperature
        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        workspace.FeedBlob('states', np.array([0], dtype=np.float32))
        workspace.FeedBlob('actions', np.array([0], dtype=np.float32))
        workspace.FeedBlob('rewards', np.array([0], dtype=np.float32))
        workspace.FeedBlob('next_states', np.array([0], dtype=np.float32))
        workspace.FeedBlob('not_terminals', np.array([0], dtype=np.float32))
        workspace.FeedBlob('next_actions', np.array([0], dtype=np.float32))
        workspace.FeedBlob(
            'possible_next_actions', np.array([0], dtype=np.float32)
        )
        workspace.FeedBlob(
            'possible_next_actions_lengths', np.array([0], dtype=np.float32)
        )

        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None
Example #3
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))

        assert parameters.training.layers[0] >= 0,\
            "Set layers[0] to a the number of features"

        self.num_features = parameters.training.layers[0]

        MLTrainer.__init__(self, "rl_trainer", parameters.training)

        self.target_network = TargetNetwork(self,
                                            parameters.rl.target_update_rate)

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma

        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
Example #4
    def __init__(
        self,
        fc_parameters: DiscreteActionModelParameters,
        cnn_parameters: CNNModelParameters,
        img_height: int,
        img_width: int,
    ) -> None:
        MLConvTrainer.__init__(self, "ml_conv_trainer", fc_parameters.training,
                               cnn_parameters, img_height, img_width)

        self.target_network = TargetNetwork(
            self, fc_parameters.rl.target_update_rate)

        self.reward_burnin = fc_parameters.rl.reward_burnin
        self.maxq_learning = fc_parameters.rl.maxq_learning
        self.rl_discount_rate = fc_parameters.rl.gamma

        self.training_iteration = 0
        self._buffers = None
        self.minibatch_size = fc_parameters.training.minibatch_size

        self.skip_normalization = True
Example #5
class RLTrainer(MLTrainer):
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))

        assert parameters.training.layers[0] >= 0,\
            "Set layers[0] to a the number of features"

        self.num_features = parameters.training.layers[0]

        MLTrainer.__init__(self, "rl_trainer", parameters.training)

        self.target_network = TargetNetwork(self,
                                            parameters.rl.target_update_rate)

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma

        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size

    @property
    def sarsa(self) -> bool:
        """
        Returns whether or not this trainer generates target values using SARSA.
        """
        return not self.maxq_learning

    def stream_tdp(self,
                   tdp: TrainingDataPage,
                   evaluator: Optional[Evaluator] = None) -> None:
        """
        Loads a large batch of transitions from a page of training data. This
        batch will be further broken down into minibatches for training.

        :param tdp: TrainingDataPage object that supplies transitions.
        :param evaluator: Evaluator object to record TD and compute MC losses.
        """
        raise NotImplementedError()

    def get_max_q_values(self, next_states: np.ndarray,
                         possible_next_actions) -> np.ndarray:
        """
        Takes in an array of next_states and outputs an array of the same shape
        whose ith entry = max_{pna} Q(state_i, pna). Uses target network for
        Q(state_i, pna) approximation.

        :param next_states: Numpy array with shape (batch_size, state_dim). Each
            row contains a representation of a state.
        :param possible_next_actions: See subclass' `get_max_q_values` documentation.
        """
        raise NotImplementedError()

    def get_sarsa_values(self, next_states: np.ndarray,
                         next_actions: np.ndarray) -> np.ndarray:
        """
        Takes in a set of next_states and corresponding next_actions. For each
        (next_state_i, next_action_i) pair, calculates Q(next_state, next_action).
        Returns these q values in a Numpy array of shape (batch_size, 1).

        :param next_states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's next_state.
        :param next_actions: See subclass' `get_sarsa_values` documentation.
        """
        raise NotImplementedError()

    def update_model(self, states: np.ndarray, actions: np.ndarray,
                     q_vals_target: np.ndarray) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:
            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to q_vals_target.
            Updates Q Network's weights according to loss and optimizer.

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The ith
            row is a representation of the ith transition's action.
        :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
            row is the label to train against for the data from the ith transition.
        """
        raise NotImplementedError()

    def stream(self, states, actions, rewards, next_states, next_actions,
               not_terminals, possible_next_actions, reward_timelines,
               evaluator):
        """
        Load large batch as training set. This batch will be broken down into
        minibatches. Assumes that states, next_states, and actions (in the
        parametric action case) need no further normalization.
        """

        assert rewards.ndim == 2
        assert not_terminals.ndim == 2

        page_size = states.shape[0]
        assert page_size == self.minibatch_size

        self.train(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            not_terminals,
            possible_next_actions,
        )
        if evaluator is not None:
            evaluator.report(
                reward_timelines,
                self.get_q_values(states, actions),
                workspace.FetchBlob(self.loss_blob),
            )

    def train(
        self,
        states: np.ndarray,
        actions: np.ndarray,
        rewards: np.ndarray,
        next_states: np.ndarray,
        next_actions: Optional[np.ndarray],
        not_terminals: np.ndarray,
        possible_next_actions,
    ) -> None:
        """
        Takes in a batch of transitions. For transition i, calculates target qval:
            next_q_values_i = {
                max_{pna_i} Q(next_state_i, pna_i), self.maxq_learning
                Q(next_state_i, next_action_i), self.sarsa
            }
            q_val_target_i = {
                r_i + gamma * next_q_values_i, not_terminals_i
                r_i, !not_terminals_i
            }
        Trains Q Network on the q_val_targets as labels.

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: See subclass' `train` documentation.
        :param rewards: Numpy array with shape (batch_size, 1). The ith entry is
            the reward experienced at the ith transition.
        :param not_terminals: Numpy array with shape (batch_size, 1). The ith entry
            is equal to 1 iff the ith transition's state is not terminal.
        :param next_states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's next state.
        :param next_actions: See subclass' `train` documentation.
        :param possible_next_actions: See subclass' `train` documentation.
        """

        batch_size = self.minibatch_size
        assert rewards.shape == (
            batch_size, 1
        ), "Invalid reward shape: " + \
            str(rewards.shape) + " != " + str(self.minibatch_size)
        assert rewards.dtype == np.float32
        assert not_terminals.shape == (
            batch_size, 1), 'terminals invalid ' + str(not_terminals.shape)

        q_vals_target = np.copy(rewards)
        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                logger.info(
                    "Minibatch number == reward_burnin. Starting RL updates.")
            if self.maxq_learning:
                next_q_values = self.get_max_q_values(next_states,
                                                      possible_next_actions)
            else:
                next_q_values = self.get_sarsa_values(next_states,
                                                      next_actions)

            q_vals_target += not_terminals * self.rl_discount_rate * next_q_values

        self.update_model(states, actions, q_vals_target)

        if self.training_iteration >= self.reward_burnin:
            self.target_network.enable_slow_updates()
        self.target_network.target_update()
        self.training_iteration += 1
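
The docstring of train above defines the TD target as a piecewise formula. The following standalone NumPy sketch uses made-up numbers, independent of the trainer classes, only to make that formula concrete.

import numpy as np

rewards = np.array([[1.0], [0.0], [2.0]], dtype=np.float32)        # shape (batch_size, 1)
not_terminals = np.array([[1.0], [0.0], [1.0]], dtype=np.float32)  # 0 marks a terminal transition
next_q_values = np.array([[0.5], [9.9], [1.5]], dtype=np.float32)  # from max-Q or SARSA
gamma = 0.9

# q_val_target_i = r_i + gamma * next_q_values_i if not terminal, else r_i.
q_vals_target = np.copy(rewards)
q_vals_target += not_terminals * gamma * next_q_values
print(q_vals_target)  # approximately [[1.45], [0.], [3.35]]
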
Example #6
class RLTrainer(MLTrainer):
    def __init__(self,
                 state_normalization_parameters: Dict[str,
                                                      NormalizationParameters],
                 parameters: Union[DiscreteActionModelParameters,
                                   ContinuousActionModelParameters],
                 skip_normalization: Optional[bool] = False) -> None:
        print(state_normalization_parameters)
        print(parameters)

        self._state_normalization_parameters = state_normalization_parameters
        MLTrainer.__init__(self, "rl_trainer", parameters.training)

        self.target_network = TargetNetwork(self,
                                            parameters.rl.target_update_rate)

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma

        self.training_iteration = 0
        self._buffers = None
        self.minibatch_size = parameters.training.minibatch_size

        self.skip_normalization = skip_normalization
        self._prepare_state_normalization()

    def _normalize_states(self, states: np.ndarray) -> np.ndarray:
        """
        Normalizes input states and replaces NaNs with 0. Returns a matrix of
        the same shape. Make sure to have set up the underlying normalization net
        with `_prepare_state_normalization`.

        :param states: Numpy array with shape (batch_size, state_dim) containing
            raw state inputs
        """
        if self.skip_normalization:
            return states
        return normalize_dense_matrix(states, self._state_features,
                                      self._state_normalization_parameters,
                                      self.state_norm_blobs,
                                      self.state_norm_net,
                                      self.state_norm_blobname_template,
                                      self.num_state_features)

    def _prepare_state_normalization(self):
        """
        Sets up operators for the state normalization net.
        """
        if self.skip_normalization:
            return
        self._state_features = list(
            self._state_normalization_parameters.keys())
        self.state_norm_net = core.Net("state_norm_net")
        self.state_norm_blobname_template = '{}_input_state'
        self.state_norm_blobs = prepare_normalization(
            self.state_norm_net, self._state_normalization_parameters,
            self._state_features, self.state_norm_blobname_template, True)

    def get_state_features(self) -> List[str]:
        return self._state_features

    @property
    def num_state_features(self) -> int:
        """
        Returns the number of features in each preprocessed state.
        """
        raise NotImplementedError()

    @property
    def sarsa(self) -> bool:
        """
        Returns whether or not this trainer generates target values using SARSA.
        """
        return not self.maxq_learning

    def predictor(self) -> RLPredictor:
        """
        Builds a Predictor using the networks underlying this Trainer.
        """
        raise NotImplementedError()

    def stream_tdp(self,
                   tdp: TrainingDataPage,
                   evaluator: Optional[Evaluator] = None) -> None:
        """
        Loads a large batch of transitions from a page of training data. This
        batch will be further broken down into minibatches for training.

        :param tdp: TrainingDataPage object that supplies transitions.
        :param evaluator: Evaluator object to record TD and compute MC losses.
        """
        raise NotImplementedError()

    def get_max_q_values(self, next_states: np.ndarray,
                         possible_next_actions) -> np.ndarray:
        """
        Takes in an array of next_states and outputs an array of the same shape
        whose ith entry = max_{pna} Q(state_i, pna). Uses target network for
        Q(state_i, pna) approximation.

        :param next_states: Numpy array with shape (batch_size, state_dim). Each
            row contains a representation of a state.
        :param possible_next_actions: See subclass' `get_max_q_values` documentation.
        """
        raise NotImplementedError()

    def get_sarsa_values(self, next_states: np.ndarray,
                         next_actions: np.ndarray) -> np.ndarray:
        """
        Takes in a set of next_states and corresponding next_actions. For each
        (next_state_i, next_action_i) pair, calculates Q(next_state, next_action).
        Returns these q values in a Numpy array of shape (batch_size, 1).

        :param next_states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's next_state.
        :param next_actions: See subclass' `get_sarsa_values` documentation.
        """
        raise NotImplementedError()

    def update_model(self, states: np.ndarray, actions: np.ndarray,
                     q_vals_target: np.ndarray) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:
            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to q_vals_target.
            Updates Q Network's weights according to loss and optimizer.

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The ith
            row is a representation of the ith transition's action.
        :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
            row is the label to train against for the data from the ith transition.
        """
        raise NotImplementedError()

    def _validate_train_inputs(
        self,
        states: np.ndarray,
        actions: np.ndarray,
        rewards: np.ndarray,
        next_states: np.ndarray,
        next_actions: Optional[np.ndarray],
        not_terminals: np.ndarray,
        possible_next_actions: np.ndarray,
    ):
        raise NotImplementedError()

    def stream(self, states, actions, rewards, next_states, next_actions,
               not_terminals, possible_next_actions, reward_timelines,
               evaluator):
        """
        Load large batch as training set. This batch will be broken down into
        minibatches. Assumes that states, next_states, and actions (in the
        parametric action case) need no further normalization.
        """

        if rewards.ndim == 1:
            rewards = rewards.reshape(-1, 1)
        if not_terminals.ndim == 1:
            not_terminals = not_terminals.reshape(-1, 1)

        use_next_actions = next_actions is not None and self.sarsa
        use_pna = possible_next_actions is not None and self.maxq_learning
        use_rt = reward_timelines is not None

        num_buffers = 8
        if self._buffers is not None and self._buffers[0].shape[0] > 0:
            actions = np.concatenate([self._buffers[0], actions])
            states = np.concatenate([self._buffers[1], states])
            rewards = np.concatenate([self._buffers[2], rewards])
            next_states = np.concatenate([self._buffers[3], next_states])
            if use_next_actions:
                next_actions = np.concatenate([self._buffers[4], next_actions])
            not_terminals = np.concatenate([self._buffers[5], not_terminals])
            if use_pna:
                possible_next_actions = np.concatenate(
                    [self._buffers[6], possible_next_actions])
            if use_rt:
                reward_timelines = np.concatenate(
                    [self._buffers[7], reward_timelines])

        self._buffers = None
        page_size = states.shape[0]

        for batch_start in range(0, page_size, self.minibatch_size):
            batch_end = batch_start + self.minibatch_size
            if page_size < batch_end:
                self._buffers = [[] for _ in range(num_buffers)]
                self._buffers[0] = actions[batch_start:]
                self._buffers[1] = states[batch_start:]
                self._buffers[2] = rewards[batch_start:]
                self._buffers[3] = next_states[batch_start:]
                if use_next_actions:
                    self._buffers[4] = next_actions[batch_start:]
                self._buffers[5] = not_terminals[batch_start:]
                if use_pna:
                    self._buffers[6] = possible_next_actions[batch_start:]
                if use_rt:
                    self._buffers[7] = reward_timelines[batch_start:]
            else:
                na_batch = (next_actions[batch_start:batch_end]
                            if use_next_actions else None)
                pna_batch = (possible_next_actions[batch_start:batch_end]
                             if use_pna else None)
                rt_batch = (reward_timelines[batch_start:batch_end]
                            if use_rt else None)
                states_batch = states[batch_start:batch_end]
                actions_batch = actions[batch_start:batch_end]
                self.train(states_batch, actions_batch,
                           rewards[batch_start:batch_end],
                           next_states[batch_start:batch_end], na_batch,
                           not_terminals[batch_start:batch_end], pna_batch)
                if evaluator is not None:
                    evaluator.report(
                        rt_batch, self.get_q_values(states_batch,
                                                    actions_batch),
                        workspace.FetchBlob(self.loss_blob))

    def train(self, states: np.ndarray, actions: np.ndarray,
              rewards: np.ndarray, next_states: np.ndarray,
              next_actions: Optional[np.ndarray], not_terminals: np.ndarray,
              possible_next_actions: Optional[List]) -> None:
        """
        Takes in a batch of transitions. For transition i, calculates target qval:
            next_q_values_i = {
                max_{pna_i} Q(next_state_i, pna_i), self.maxq_learning
                Q(next_state_i, next_action_i), self.sarsa
            }
            q_val_target_i = {
                r_i + gamma * next_q_values_i, not_terminals_i
                r_i, !not_terminals_i
            }
        Trains Q Network on the q_val_targets as labels.

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: See subclass' `train` documentation.
        :param rewards: Numpy array with shape (batch_size, 1). The ith entry is
            the reward experienced at the ith transition.
        :param not_terminals: Numpy array with shape (batch_size, 1). The ith entry
            is equal to 1 iff the ith transition's state is not terminal.
        :param next_states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's next state.
        :param next_actions: See subclass' `train` documentation.
        :param possible_next_actions: See subclass' `train` documentation.
        """

        self._validate_train_inputs(states, actions, rewards, next_states,
                                    next_actions, not_terminals,
                                    possible_next_actions)

        batch_size = self.minibatch_size
        assert rewards.shape == (batch_size, 1)
        assert rewards.dtype == np.float32
        assert not_terminals.shape == (batch_size, 1)

        q_vals_target = np.copy(rewards)
        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                logger.info(
                    "Minibatch number == reward_burnin. Starting RL updates.")
            if self.maxq_learning:
                next_q_values = self.get_max_q_values(next_states,
                                                      possible_next_actions)
            else:
                next_q_values = self.get_sarsa_values(next_states,
                                                      next_actions)

            q_vals_target += not_terminals * self.rl_discount_rate * next_q_values

        self.update_model(states, actions, q_vals_target)

        if self.training_iteration >= self.reward_burnin:
            self.target_network.enable_slow_updates()
        self.target_network.target_update()
        self.training_iteration += 1
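
The stream method in this example splits a page of transitions into fixed-size minibatches and keeps any leftover rows in self._buffers so they are prepended to the next page. Below is a simplified, hypothetical sketch of that buffering pattern, using a single array in place of the eight parallel buffers.

import numpy as np

class MiniBatcher:
    """Hypothetical sketch of the leftover-row buffering used by stream."""

    def __init__(self, minibatch_size):
        self.minibatch_size = minibatch_size
        self._buffer = None  # leftover rows from the previous page

    def stream(self, rows, train_fn):
        if self._buffer is not None and self._buffer.shape[0] > 0:
            rows = np.concatenate([self._buffer, rows])
        self._buffer = None
        page_size = rows.shape[0]
        for start in range(0, page_size, self.minibatch_size):
            end = start + self.minibatch_size
            if page_size < end:
                # Not enough rows left for a full minibatch; keep them for later.
                self._buffer = rows[start:]
            else:
                train_fn(rows[start:end])

batcher = MiniBatcher(minibatch_size=4)
batcher.stream(np.arange(6), lambda b: print("train on", b))     # trains on [0 1 2 3], buffers [4 5]
batcher.stream(np.arange(6, 10), lambda b: print("train on", b)) # trains on [4 5 6 7], buffers [8 9]
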
Example #7
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))
        RLTrainer.num_trainers += 1
        self.model_id = RL_TRAINER_PREFIX + str(RLTrainer.num_trainers)

        if parameters.training.cnn_parameters is not None:
            self.conv_ml_trainer = ConvMLTrainer(
                CONV_ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
            )

            # The final layer of the conv net is the input to the fc net.
            parameters.training.layers[
                0] = self.conv_ml_trainer.get_output_size()

            self.conv_target_network = ConvTargetNetwork(
                CONV_TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
                parameters.rl.target_update_rate,
                self.conv_ml_trainer,
            )
        else:
            self.conv_ml_trainer = None
            self.conv_target_network = None

        assert (parameters.training.layers[0] >=
                0), "Set layers[0] to a the number of features"

        self.ml_trainer = MLTrainer(
            ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
            parameters.training)

        self.target_network = TargetNetwork(
            TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
            parameters.training,
            parameters.rl.target_update_rate,
            self.ml_trainer,
        )

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma
        self.rl_temperature = parameters.rl.temperature
        self.use_seq_num_diff_as_time_diff = parameters.rl.use_seq_num_diff_as_time_diff
        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        workspace.FeedBlob("states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("actions", np.array([0], dtype=np.float32))
        workspace.FeedBlob("rewards", np.array([0], dtype=np.float32))
        workspace.FeedBlob("next_states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("not_terminals", np.array([0], dtype=np.float32))
        if self.maxq_learning:
            workspace.FeedBlob("possible_next_actions",
                               np.array([0], dtype=np.float32))
            workspace.FeedBlob("possible_next_actions_lengths",
                               np.array([0], dtype=np.float32))
        else:
            workspace.FeedBlob("next_actions", np.array([0], dtype=np.float32))
        # Setting to 1 serves as a 1 unit time_diff if not set by user
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))

        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None
Example #8
class RLTrainer:
    num_trainers = 0
    DEFAULT_TRAINING_NUM_WORKERS = 4

    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))
        RLTrainer.num_trainers += 1
        self.model_id = RL_TRAINER_PREFIX + str(RLTrainer.num_trainers)

        if parameters.training.cnn_parameters is not None:
            self.conv_ml_trainer = ConvMLTrainer(
                CONV_ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
            )

            # The final layer of the conv net is the input to the fc net.
            parameters.training.layers[
                0] = self.conv_ml_trainer.get_output_size()

            self.conv_target_network = ConvTargetNetwork(
                CONV_TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
                parameters.rl.target_update_rate,
                self.conv_ml_trainer,
            )
        else:
            self.conv_ml_trainer = None
            self.conv_target_network = None

        assert (parameters.training.layers[0] >=
                0), "Set layers[0] to a the number of features"

        self.ml_trainer = MLTrainer(
            ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
            parameters.training)

        self.target_network = TargetNetwork(
            TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
            parameters.training,
            parameters.rl.target_update_rate,
            self.ml_trainer,
        )

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma
        self.rl_temperature = parameters.rl.temperature
        self.use_seq_num_diff_as_time_diff = parameters.rl.use_seq_num_diff_as_time_diff
        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        workspace.FeedBlob("states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("actions", np.array([0], dtype=np.float32))
        workspace.FeedBlob("rewards", np.array([0], dtype=np.float32))
        workspace.FeedBlob("next_states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("not_terminals", np.array([0], dtype=np.float32))
        if self.maxq_learning:
            workspace.FeedBlob("possible_next_actions",
                               np.array([0], dtype=np.float32))
            workspace.FeedBlob("possible_next_actions_lengths",
                               np.array([0], dtype=np.float32))
        else:
            workspace.FeedBlob("next_actions", np.array([0], dtype=np.float32))
        # Setting to 1 serves as a 1 unit time_diff if not set by user
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))

        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

    def get_possible_next_actions(self):
        raise NotImplementedError()

    def get_max_q_values(self, next_states: str, possible_next_actions,
                         use_target_network: bool) -> str:
        """
        Takes in an array of next_states and outputs an array of the same shape
        whose ith entry = max_{pna} Q(state_i, pna). Uses target network for
        Q(state_i, pna) approximation.

        :param next_states: Numpy array with shape (batch_size, state_dim). Each
            row contains a representation of a state.
        :param possible_next_actions: See subclass' `get_max_q_values` documentation.
        """
        raise NotImplementedError()

    def get_q_values(self, states: str, actions: str,
                     use_target_network: bool) -> str:
        """
        Takes in a set of states and corresponding actions. For each
        (state_i, action_i) pair, calculates Q(state_i, action_i). Returns the
        name of a blob holding these q values, with shape (batch_size, 1).

        :param states: Name of a blob containing a matrix with shape
            (batch_size, state_dim). The ith row is a representation of the ith
            transition's state.
        :param actions: See subclass' `get_q_values` documentation.
        :param use_target_network: Whether to compute the values with the
            target network instead of the trained network.
        """
        raise NotImplementedError()

    def update_model(self, states: str, actions: str,
                     q_vals_target: str) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:
            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to q_vals_target.
            Updates Q Network's weights according to loss and optimizer.

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The ith
            row is a representation of the ith transition's action.
        :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
            row is the label to train against for the data from the ith transition.
        """
        raise NotImplementedError()

    def _create_reward_train_net(self) -> None:
        raise NotImplementedError()

    def _create_rl_train_net(self) -> None:
        raise NotImplementedError()

    def _create_q_score_net(self) -> None:
        self.q_score_model = ModelHelper(name="q_score_" + self.model_id)
        C2.set_model(self.q_score_model)
        self.q_score_output = self.get_q_values("states", "actions", True)
        workspace.RunNetOnce(self.q_score_model.param_init_net)
        self.q_score_model.net.Proto().num_workers = (
            RLTrainer.DEFAULT_TRAINING_NUM_WORKERS)
        self.q_score_model.net.Proto().type = "async_scheduling"
        workspace.CreateNet(self.q_score_model.net)
        C2.set_model(None)

    def train_numpy(self, tdp: TrainingDataPage,
                    evaluator: Optional[Evaluator]):
        workspace.FeedBlob("states", tdp.states)
        workspace.FeedBlob("actions", tdp.actions)
        workspace.FeedBlob("rewards", tdp.rewards)
        workspace.FeedBlob("next_states", tdp.next_states)
        workspace.FeedBlob("not_terminals", tdp.not_terminals)
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))
        if self.maxq_learning:
            if isinstance(tdp.possible_next_actions, StackedArray):
                workspace.FeedBlob("possible_next_actions",
                                   tdp.possible_next_actions.values)
                workspace.FeedBlob("possible_next_actions_lengths",
                                   tdp.possible_next_actions.lengths)
            else:
                workspace.FeedBlob("possible_next_actions",
                                   tdp.possible_next_actions)
        else:
            workspace.FeedBlob("next_actions", tdp.next_actions)
        self.train()
        if evaluator is not None:
            self.evaluate(evaluator, tdp.actions, tdp.propensities,
                          tdp.episode_values)

    def train(self) -> None:
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                logger.info(
                    "Minibatch number == reward_burnin. Starting RL updates.")
                self.target_network.enable_slow_updates()
                if self.conv_target_network:
                    self.conv_target_network.enable_slow_updates()
            workspace.RunNet(self.rl_train_model.net)
        else:
            workspace.RunNet(self.reward_train_model.net)

        workspace.RunNet(self.target_network._update_model.net)
        if self.conv_target_network:
            workspace.RunNet(self.conv_target_network._update_model.net)
        self.training_iteration += 1
        workspace.RunNet(self.q_score_model.net)

    def evaluate(
        self,
        evaluator: Optional[Evaluator],
        logged_actions: Optional[np.ndarray],
        logged_propensities: Optional[np.ndarray],
        logged_values: Optional[np.ndarray],
    ):
        raise NotImplementedError()

    def build_predictor(self, model, input_blob, output_blob) -> List[str]:
        retval: List[str] = []
        if self.conv_ml_trainer is not None:
            conv_output = model.net.NextBlob("conv_output")
            retval = self.conv_ml_trainer.build_predictor(
                model, input_blob, conv_output)
            conv_output_flat = model.net.NextBlob("conv_output_flat")
            model.net.Flatten([conv_output], [conv_output_flat])
            input_blob = conv_output_flat
        retval += self.ml_trainer.build_predictor(model, input_blob,
                                                  output_blob)
        return retval
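
The train method in this example gates which net runs on a reward burn-in counter: the reward prediction net for the first reward_burnin minibatches, then the RL net with slow target updates enabled from that point on. The sketch below is a hypothetical reduction of just that schedule, detached from Caffe2.

class BurninSchedule:
    """Hypothetical sketch of the reward_burnin gating in train()."""

    def __init__(self, reward_burnin):
        self.reward_burnin = reward_burnin
        self.training_iteration = 0
        self.slow_updates_enabled = False

    def step(self):
        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                # First RL minibatch: switch the target network to slow updates.
                self.slow_updates_enabled = True
            phase = "rl_train"       # would run rl_train_model.net
        else:
            phase = "reward_train"   # would run reward_train_model.net
        self.training_iteration += 1
        return phase

schedule = BurninSchedule(reward_burnin=2)
print([schedule.step() for _ in range(4)])
# ['reward_train', 'reward_train', 'rl_train', 'rl_train']
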
Example #9
class RLTrainer(MLTrainer):
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))

        assert parameters.training.layers[0] >= 0,\
            "Set layers[0] to a the number of features"

        self.num_features = parameters.training.layers[0]

        MLTrainer.__init__(self, RL_TRAINER_MODEL_ID, parameters.training)

        self.target_network = TargetNetwork(
            self, parameters.rl.target_update_rate
        )

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma
        self.rl_temperature = parameters.rl.temperature
        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        workspace.FeedBlob('states', np.array([0], dtype=np.float32))
        workspace.FeedBlob('actions', np.array([0], dtype=np.float32))
        workspace.FeedBlob('rewards', np.array([0], dtype=np.float32))
        workspace.FeedBlob('next_states', np.array([0], dtype=np.float32))
        workspace.FeedBlob('not_terminals', np.array([0], dtype=np.float32))
        workspace.FeedBlob('next_actions', np.array([0], dtype=np.float32))
        workspace.FeedBlob(
            'possible_next_actions', np.array([0], dtype=np.float32)
        )
        workspace.FeedBlob(
            'possible_next_actions_lengths', np.array([0], dtype=np.float32)
        )

        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

    def get_possible_next_actions(self):
        raise NotImplementedError()

    def get_max_q_values(
        self,
        next_states: str,
        possible_next_actions,
        use_target_network: bool,
    ) -> str:
        """
        Takes in an array of next_states and outputs an array of the same shape
        whose ith entry = max_{pna} Q(state_i, pna). Uses target network for
        Q(state_i, pna) approximation.

        :param next_states: Numpy array with shape (batch_size, state_dim). Each
            row contains a representation of a state.
        :param possible_next_actions: See subclass' `get_max_q_values` documentation.
        """
        raise NotImplementedError()

    def get_q_values(
        self,
        states: str,
        actions: str,
        use_target_network: bool,
    ) -> str:
        """
        Takes in a set of states and corresponding actions. For each
        (state_i, action_i) pair, calculates Q(state_i, action_i). Returns the
        name of a blob holding these q values, with shape (batch_size, 1).

        :param states: Name of a blob containing a matrix with shape
            (batch_size, state_dim). The ith row is a representation of the ith
            transition's state.
        :param actions: See subclass' `get_q_values` documentation.
        :param use_target_network: Whether to compute the values with the
            target network instead of the trained network.
        """
        raise NotImplementedError()

    def update_model(
        self,
        states: str,
        actions: str,
        q_vals_target: str,
    ) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:
            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to q_vals_target.
            Updates Q Network's weights according to loss and optimizer.

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The ith
            row is a representation of the ith transition's action.
        :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
            row is the label to train against for the data from the ith transition.
        """
        raise NotImplementedError()

    def _create_reward_train_net(self) -> None:
        raise NotImplementedError()

    def _create_rl_train_net(self) -> None:
        raise NotImplementedError()

    def _create_q_score_net(self) -> None:
        self.q_score_model = ModelHelper(name="q_score_" + self.model_id)
        C2.set_model(self.q_score_model)
        self.q_score_output = self.get_q_values('states', 'actions', True)
        workspace.RunNetOnce(self.q_score_model.param_init_net)
        workspace.CreateNet(self.q_score_model.net)
        C2.set_model(None)

    def train_numpy(
        self,
        tdp: TrainingDataPage,
        evaluator: Optional[Evaluator],
    ):
        workspace.FeedBlob('states', tdp.states)
        workspace.FeedBlob('actions', tdp.actions)
        workspace.FeedBlob('rewards', tdp.rewards)
        workspace.FeedBlob('next_states', tdp.next_states)
        workspace.FeedBlob('not_terminals', tdp.not_terminals)
        if self.maxq_learning:
            if isinstance(tdp.possible_next_actions, StackedArray):
                workspace.FeedBlob(
                    'possible_next_actions', tdp.possible_next_actions.values
                )
                workspace.FeedBlob(
                    'possible_next_actions_lengths',
                    tdp.possible_next_actions.lengths
                )
            else:
                workspace.FeedBlob(
                    'possible_next_actions', tdp.possible_next_actions
                )
        else:
            workspace.FeedBlob('next_actions', tdp.next_actions)
        self.train(tdp.reward_timelines, evaluator)

    def train(
        self,
        reward_timelines: Optional[List[Dict[int, float]]],
        evaluator: Optional[Evaluator],
    ) -> None:
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                logger.info(
                    "Minibatch number == reward_burnin. Starting RL updates."
                )
                self.target_network.enable_slow_updates()
            workspace.RunNet(self.rl_train_model.net)
        else:
            workspace.RunNet(self.reward_train_model.net)

        self.target_network.target_update()
        self.training_iteration += 1
        workspace.RunNet(self.q_score_model.net)
        if evaluator is not None:
            assert reward_timelines is not None
            assert self.loss_blob is not None
            evaluator.report(
                reward_timelines,
                workspace.FetchBlob(self.q_score_output),
                workspace.FetchBlob(self.loss_blob),
            )
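
All of these trainers exchange data with Caffe2 through named workspace blobs: FeedBlob writes a NumPy array under a name, the nets read and write blobs by name, and FetchBlob pulls results back out. The sketch below shows that pattern with a toy one-operator net, not one of the trainer nets above.

import numpy as np
from caffe2.python import core, workspace

# Feed an input blob, run a tiny net that reads and writes blobs by name,
# then fetch the result back as a NumPy array.
workspace.FeedBlob("states", np.array([[-1.0, 2.0]], dtype=np.float32))

net = core.Net("toy_net")
net.Relu(["states"], ["activations"])

workspace.RunNetOnce(net)
print(workspace.FetchBlob("activations"))  # [[0. 2.]]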