Esempio n. 1
0
    def __init__(self,
                 state_normalization_parameters: Dict[str,
                                                      NormalizationParameters],
                 parameters: Union[DiscreteActionModelParameters,
                                   ContinuousActionModelParameters],
                 skip_normalization: Optional[bool] = False) -> None:
        """Initialise the trainer from normalization and model parameters.

        Args:
            state_normalization_parameters: Mapping from string keys to
                ``NormalizationParameters``; kept on the instance as
                ``_state_normalization_parameters``.
            parameters: Model configuration. Its ``training`` section is
                handed to ``MLTrainer.__init__`` and its ``rl`` section
                supplies the target update rate, reward burn-in, max-Q flag
                and discount rate (``gamma``).
            skip_normalization: Stored as ``self.skip_normalization`` before
                ``_prepare_state_normalization`` is called.
        """
        # Report the configuration through logging rather than print() so
        # that constructing a trainer does not write to stdout. The import
        # is local so this method does not rely on a module-level logger.
        import logging

        config_log = logging.getLogger(__name__)
        config_log.info("%s", state_normalization_parameters)
        config_log.info("%s", parameters)

        # Set before the base initialiser runs.
        # NOTE(review): presumably MLTrainer.__init__ reads this attribute;
        # confirm before reordering.
        self._state_normalization_parameters = state_normalization_parameters
        MLTrainer.__init__(self, "rl_trainer", parameters.training)

        rl_params = parameters.rl
        self.target_network = TargetNetwork(self,
                                            rl_params.target_update_rate)

        self.reward_burnin = rl_params.reward_burnin
        self.maxq_learning = rl_params.maxq_learning
        self.rl_discount_rate = rl_params.gamma

        self.training_iteration = 0
        self._buffers = None
        self.minibatch_size = parameters.training.minibatch_size

        # NOTE(review): skip_normalization is assigned first on the
        # assumption that _prepare_state_normalization consults it; verify.
        self.skip_normalization = skip_normalization
        self._prepare_state_normalization()
Esempio n. 2
0
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        """Configure the RL trainer and build its three nets.

        Args:
            parameters: Model configuration. ``parameters.training`` is
                passed to ``MLTrainer.__init__`` and provides ``layers`` and
                ``minibatch_size``; ``parameters.rl`` provides the target
                update rate, reward burn-in, max-Q flag, ``gamma`` and
                ``temperature``.

        Raises:
            AssertionError: If ``parameters.training.layers[0]`` is negative,
                or if any of the three models is still ``None`` after the
                ``_create_*`` methods have run.
        """
        logger.info(str(parameters))

        training_params = parameters.training
        input_width = training_params.layers[0]
        assert input_width >= 0,\
            "Set layers[0] to a the number of features"
        self.num_features = input_width

        MLTrainer.__init__(self, RL_TRAINER_MODEL_ID, training_params)

        rl_params = parameters.rl
        self.target_network = TargetNetwork(
            self, rl_params.target_update_rate
        )

        self.reward_burnin = rl_params.reward_burnin
        self.maxq_learning = rl_params.maxq_learning
        self.rl_discount_rate = rl_params.gamma
        self.rl_temperature = rl_params.temperature
        self.training_iteration = 0
        self.minibatch_size = training_params.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        # Feed every input blob with a one-element float32 zero array; a
        # fresh array is created for each blob.
        for blob_name in (
            'states',
            'actions',
            'rewards',
            'next_states',
            'not_terminals',
            'next_actions',
            'possible_next_actions',
            'possible_next_actions_lengths',
        ):
            workspace.FeedBlob(blob_name, np.array([0], dtype=np.float32))

        # The models start out unset; the asserts below check that the
        # three _create_* calls have filled them in.
        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None
Esempio n. 3
0
    def __init__(
        self,
        fc_parameters: DiscreteActionModelParameters,
        cnn_parameters: CNNModelParameters,
        img_height: int,
        img_width: int,
    ) -> None:
        """Initialise the convolutional trainer and its RL settings.

        Args:
            fc_parameters: Discrete-action model parameters; the
                ``training`` section goes to ``MLConvTrainer.__init__`` and
                the ``rl`` section supplies the target update rate, reward
                burn-in, max-Q flag and ``gamma``.
            cnn_parameters: Forwarded unchanged to ``MLConvTrainer.__init__``.
            img_height: Forwarded unchanged to ``MLConvTrainer.__init__``.
            img_width: Forwarded unchanged to ``MLConvTrainer.__init__``.
        """
        training_params = fc_parameters.training
        MLConvTrainer.__init__(
            self,
            "ml_conv_trainer",
            training_params,
            cnn_parameters,
            img_height,
            img_width,
        )

        rl_params = fc_parameters.rl
        self.target_network = TargetNetwork(self, rl_params.target_update_rate)

        self.reward_burnin = rl_params.reward_burnin
        self.maxq_learning = rl_params.maxq_learning
        self.rl_discount_rate = rl_params.gamma

        self.training_iteration = 0
        self._buffers = None
        self.minibatch_size = training_params.minibatch_size

        # This trainer always sets the flag to True; there is no parameter
        # to override it.
        self.skip_normalization = True
Esempio n. 4
0
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        """Initialise the RL trainer on top of ``MLTrainer``.

        Args:
            parameters: Model configuration. ``parameters.training`` is
                passed to ``MLTrainer.__init__`` and provides ``layers`` and
                ``minibatch_size``; ``parameters.rl`` provides the target
                update rate, reward burn-in, max-Q flag and ``gamma``.

        Raises:
            AssertionError: If ``parameters.training.layers[0]`` is negative.
        """
        logger.info(str(parameters))

        training_params = parameters.training
        input_width = training_params.layers[0]
        assert input_width >= 0,\
            "Set layers[0] to a the number of features"
        self.num_features = input_width

        MLTrainer.__init__(self, "rl_trainer", training_params)

        rl_params = parameters.rl
        self.target_network = TargetNetwork(self, rl_params.target_update_rate)

        self.reward_burnin = rl_params.reward_burnin
        self.maxq_learning = rl_params.maxq_learning
        self.rl_discount_rate = rl_params.gamma

        self.training_iteration = 0
        self.minibatch_size = training_params.minibatch_size
Esempio n. 5
0
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        """Build the trainer, its optional conv front end, and its nets.

        Increments the class-wide ``RLTrainer.num_trainers`` counter and uses
        the new value as the suffix of every component name created here.

        Args:
            parameters: Model configuration. ``parameters.training`` supplies
                ``cnn_parameters``, ``layers`` and ``minibatch_size``;
                ``parameters.rl`` supplies the target update rate, reward
                burn-in, max-Q flag, ``gamma``, ``temperature`` and
                ``use_seq_num_diff_as_time_diff``. When ``cnn_parameters`` is
                set, ``parameters.training.layers[0]`` is overwritten in
                place, so the caller's object is modified.

        Raises:
            AssertionError: If ``parameters.training.layers[0]`` is negative,
                or if any of the three models is still ``None`` after the
                ``_create_*`` methods have run.
        """
        logger.info(str(parameters))
        RLTrainer.num_trainers += 1
        self.model_id = RL_TRAINER_PREFIX + str(RLTrainer.num_trainers)

        training_params = parameters.training
        cnn_params = training_params.cnn_parameters
        if cnn_params is None:
            self.conv_ml_trainer = None
            self.conv_target_network = None
        else:
            self.conv_ml_trainer = ConvMLTrainer(
                CONV_ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
                cnn_params,
            )

            # The conv net's final layer feeds the fully connected net, so
            # its output size becomes the first fc layer size.
            training_params.layers[0] = self.conv_ml_trainer.get_output_size()

            self.conv_target_network = ConvTargetNetwork(
                CONV_TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
                cnn_params,
                parameters.rl.target_update_rate,
                self.conv_ml_trainer,
            )

        assert (training_params.layers[0] >=
                0), "Set layers[0] to a the number of features"

        self.ml_trainer = MLTrainer(
            ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
            training_params,
        )

        rl_params = parameters.rl
        self.target_network = TargetNetwork(
            TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
            training_params,
            rl_params.target_update_rate,
            self.ml_trainer,
        )

        self.reward_burnin = rl_params.reward_burnin
        self.maxq_learning = rl_params.maxq_learning
        self.rl_discount_rate = rl_params.gamma
        self.rl_temperature = rl_params.temperature
        self.use_seq_num_diff_as_time_diff = (
            rl_params.use_seq_num_diff_as_time_diff
        )
        self.training_iteration = 0
        self.minibatch_size = training_params.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        # Blobs fed with a one-element float32 zero array, in feed order.
        # Which next-action blobs are fed depends on the max-Q flag.
        zero_blob_names = [
            "states",
            "actions",
            "rewards",
            "next_states",
            "not_terminals",
        ]
        if self.maxq_learning:
            zero_blob_names.extend(
                ["possible_next_actions", "possible_next_actions_lengths"]
            )
        else:
            zero_blob_names.append("next_actions")
        for blob_name in zero_blob_names:
            workspace.FeedBlob(blob_name, np.array([0], dtype=np.float32))

        # A value of 1 acts as a one-unit time_diff when the user sets none.
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))

        # The models start out unset; the asserts below check that the
        # three _create_* calls have filled them in.
        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None