Example #1
class FullyConnected(ValueNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    use_layer_norm: bool = False

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_value_network(self,
                            state_normalization_data: NormalizationData,
                            output_dim: int = 1) -> torch.nn.Module:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        return FloatFeatureFullyConnected(
            state_dim=state_dim,
            output_dim=output_dim,
            sizes=self.sizes,
            activations=self.activations,
            use_layer_norm=self.use_layer_norm,
        )
Example #2
class DuelingQuantile(QRDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])

    def __post_init_post_parse__(self):
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_normalization_data: NormalizationData,
        output_dim: int,
        num_atoms: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_data)
        return DuelingQNetwork.make_fully_connected(
            state_dim,
            output_dim,
            layers=self.sizes,
            activations=self.activations,
            num_atoms=num_atoms,
        )
Example #3
    def __init__(
        self,
        actor_network,
        q1_network,
        q2_network=None,
        # Start TD3TrainerParameters
        rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
        q_network_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default
        ),
        actor_network_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default
        ),
        minibatch_size: int = 64,
        noise_variance: float = 0.2,
        noise_clip: float = 0.5,
        delayed_policy_update: int = 2,
        minibatches_per_step: int = 1,
    ) -> None:
        """
        Args:
            actor_network: states -> actions, trained to maximize value
            q1_network: states, action -> q-value
            q2_network (optional): double q-learning to stabilize training
                from overestimation bias
            rl (optional): an instance of the RLParameter class, which
                defines relevant hyperparameters
            q_network_optimizer (optional): the optimizer class and
                optimizer hyperparameters for the q network(s) optimizer
            actor_network_optimizer (optional): see q_network_optimizer
            minibatch_size (optional): the size of the minibatch
            noise_variance (optional): the variance of action noise added to smooth
                q-value estimates
            noise_clip (optional): the maximum absolute value of action noise added
                to smooth q-value estimates
            delayed_policy_update (optional): the ratio of q network updates
                to target and policy network updates
            minibatches_per_step (optional, TODO: currently unused): the number of minibatch updates
                per training step
        """
        super().__init__()
        self.rl_parameters = rl
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step or 1

        self.q1_network = q1_network
        self.q1_network_target = copy.deepcopy(self.q1_network)
        self.q_network_optimizer = q_network_optimizer

        self.q2_network = q2_network
        if self.q2_network is not None:
            self.q2_network_target = copy.deepcopy(self.q2_network)

        self.actor_network = actor_network
        self.actor_network_target = copy.deepcopy(self.actor_network)
        self.actor_network_optimizer = actor_network_optimizer

        self.noise_variance = noise_variance
        self.noise_clip_range = (-noise_clip, noise_clip)
        self.delayed_policy_update = delayed_policy_update
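A hypothetical construction sketch for the trainer above (not from the source): actor, q1 and q2 stand in for already-built ReAgent network modules, and the class name TD3Trainer is an assumption, since the excerpt shows only the __init__.

# Hypothetical usage; the network objects and the TD3Trainer name are assumptions.
trainer = TD3Trainer(
    actor_network=actor,
    q1_network=q1,
    q2_network=q2,  # optional; omit to train with a single critic
    noise_variance=0.2,
    noise_clip=0.5,
    delayed_policy_update=2,
)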
Example #4
class FullyConnected(ValueNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    use_layer_norm: bool = False

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    # pyre-fixme[14]: `build_value_network` overrides method defined in
    #  `ValueNetBuilder` inconsistently.
    def build_value_network(
            self,
            state_normalization_data: NormalizationData) -> torch.nn.Module:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        return FullyConnectedNetwork(
            [state_dim] + self.sizes + [1],
            self.activations + ["linear"],
            use_layer_norm=self.use_layer_norm,
        )
Example #5
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        use_gpu: bool = False,
        # Start ParametricDQNTrainerParameters
        rl: rlp.RLParameters = field(
            default_factory=rlp.RLParameters),  # noqa: B008
        double_q_learning: bool = True,
        minibatch_size: int = 1024,
        minibatches_per_step: int = 1,
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
    ) -> None:
        super().__init__(rl, use_gpu=use_gpu)

        self.double_q_learning = double_q_learning
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step or 1

        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer.make_optimizer(
            self.q_network.parameters())

        self.reward_network = reward_network
        self.reward_network_optimizer = optimizer.make_optimizer(
            self.reward_network.parameters())
Example #6
class FullyConnected(ParametricDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [128, 64])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    use_batch_norm: bool = False
    use_layer_norm: bool = False

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
        output_dim: int = 1,
    ) -> ModelBase:
        state_dim = get_num_output_features(state_normalization_parameters)
        action_dim = get_num_output_features(action_normalization_parameters)
        return FullyConnectedCritic(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.sizes,
            activations=self.activations,
            use_batch_norm=self.use_batch_norm,
            use_layer_norm=self.use_layer_norm,
            output_dim=output_dim,
        )
Example #7
class FullyConnected(DiscreteActorNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [128, 64])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    use_batch_norm: bool = False
    use_layer_norm: bool = False
    action_activation: str = "tanh"
    exploration_variance: Optional[float] = None

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_actor(
        self,
        state_normalization_data: NormalizationData,
        num_actions: int,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        return FullyConnectedActor(
            state_dim=state_dim,
            action_dim=num_actions,
            sizes=self.sizes,
            activations=self.activations,
            use_batch_norm=self.use_batch_norm,
            action_activation=self.action_activation,
            exploration_variance=self.exploration_variance,
        )
Example #8
    def __init__(
        self,
        seq2slate_net: Seq2SlateTransformerNet,
        minibatch_size: int = 1024,
        parameters: Seq2SlateParameters = field(  # noqa: B008
            default_factory=Seq2SlateParameters),
        baseline_net: Optional[BaselineNet] = None,
        baseline_warmup_num_batches: int = 0,
        use_gpu: bool = False,
        policy_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        baseline_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
    ) -> None:
        self.seq2slate_net = seq2slate_net
        self.parameters = parameters
        self.use_gpu = use_gpu

        self.minibatch_size = minibatch_size
        self.minibatch = 0

        self.baseline_net = baseline_net
        self.baseline_warmup_num_batches = baseline_warmup_num_batches

        self.rl_opt = policy_optimizer.make_optimizer(
            self.seq2slate_net.parameters())
        if self.baseline_net:
            self.baseline_opt = baseline_optimizer.make_optimizer(
                # pyre-fixme[16]: `Optional` has no attribute `parameters`.
                self.baseline_net.parameters())

        assert (self.parameters.importance_sampling_clamp_max is None
                or not self.parameters.on_policy), (
                    "importance_sampling_clamp_max is not useful and should "
                    "be set to None in on-policy learning")
Example #9
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        # Start ParametricDQNTrainerParameters
        rl: rlp.RLParameters = field(
            default_factory=rlp.RLParameters),  # noqa: B008
        double_q_learning: bool = True,
        minibatch_size: int = 1024,
        minibatches_per_step: int = 1,
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
    ) -> None:
        super().__init__()
        self.rl_parameters = rl

        self.double_q_learning = double_q_learning
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step or 1

        self.q_network = q_network
        self.q_network_target = q_network_target
        self.reward_network = reward_network
        self.optimizer = optimizer

        if rl.q_network_loss == "mse":
            self.q_network_loss = F.mse_loss
        elif rl.q_network_loss == "huber":
            self.q_network_loss = F.smooth_l1_loss
        else:
            raise Exception("Q-Network loss type {} not valid loss.".format(
                rl.q_network_loss))
Example #10
    def __init__(
        self,
        q_network,
        q_network_target,
        use_gpu: bool = False,
        # Start SlateQTrainerParameters
        rl: rlp.RLParameters = field(  # noqa: B008
            default_factory=lambda: rlp.RLParameters(maxq_learning=False)),
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        single_selection: bool = True,
        minibatch_size: int = 1024,
        evaluation: rlp.EvaluationParameters = field(  # noqa: B008
            default_factory=lambda: rlp.EvaluationParameters(
                calc_cpe_in_training=False)),
    ) -> None:
        super().__init__(rl, use_gpu=use_gpu)
        self.minibatches_per_step = 1
        self.minibatch_size = minibatch_size
        self.single_selection = single_selection

        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer.make_optimizer(
            self.q_network.parameters())
Example #11
class SingleStepSyntheticReward(SyntheticRewardNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    last_layer_activation: str = "sigmoid"
    use_batch_norm: bool = False
    use_layer_norm: bool = False

    def build_synthetic_reward_network(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: Optional[NormalizationData] = None,
        discrete_action_names: Optional[List[str]] = None,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        if not discrete_action_names:
            assert action_normalization_data is not None
            action_dim = get_num_output_features(
                action_normalization_data.dense_normalization_parameters)
        else:
            action_dim = len(discrete_action_names)
        net = SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.sizes,
            activations=self.activations,
            last_layer_activation=self.last_layer_activation,
            use_batch_norm=self.use_batch_norm,
            use_layer_norm=self.use_layer_norm,
        )
        return SyntheticRewardNet(net)
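A usage sketch for the synthetic reward builder above (illustration only): state_norm and action_norm stand in for NormalizationData objects, and the action names are made up. The two calls mirror the discrete/parametric branch inside build_synthetic_reward_network.

builder = SingleStepSyntheticReward(last_layer_activation="sigmoid")

# Discrete-action case: action_dim is taken from the action names.
discrete_reward_net = builder.build_synthetic_reward_network(
    state_norm, discrete_action_names=["up", "down"])

# Parametric-action case: action_dim is derived from the action normalization data.
parametric_reward_net = builder.build_synthetic_reward_network(
    state_norm, action_normalization_data=action_norm)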
Example #12
class Categorical(CategoricalDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_normalization_data: NormalizationData,
        output_dim: int,
        num_atoms: int,
        qmin: int,
        qmax: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_data)
        distributional_network = FullyConnectedDQN(
            state_dim=state_dim,
            action_dim=output_dim,
            num_atoms=num_atoms,
            sizes=self.sizes,
            activations=self.activations,
            use_batch_norm=False,
            dropout_ratio=0.0,
        )
        return CategoricalDQN(distributional_network,
                              qmin=qmin,
                              qmax=qmax,
                              num_atoms=num_atoms)
Example #13
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        rl: rlp.RLParameters = field(
            default_factory=rlp.RLParameters),  # noqa B008
        double_q_learning: bool = True,
        minibatch_size: int = 1024,
        minibatches_per_step: int = 1,
        optimizer: rlp.OptimizerParameters = field(  # noqa B008
            default_factory=rlp.OptimizerParameters),
        use_gpu: bool = False,
    ) -> None:
        super().__init__(rl, use_gpu=use_gpu)

        self.double_q_learning = double_q_learning
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step or 1

        self.q_network = q_network
        self.q_network_target = q_network_target
        self._set_optimizer(optimizer.optimizer)
        self.q_network_optimizer = self.optimizer_func(
            self.q_network.parameters(),
            lr=optimizer.learning_rate,
            weight_decay=optimizer.l2_decay,
        )

        self.reward_network = reward_network
        self.reward_network_optimizer = self.optimizer_func(
            self.reward_network.parameters(),
            lr=optimizer.learning_rate,
            weight_decay=optimizer.l2_decay,
        )
Example #14
class Categorical(CategoricalDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    num_atoms: int = 51
    qmin: int = -100
    qmax: int = 200

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}"
        )

    def build_q_network(
        self,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        output_dim: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_parameters)
        return CategoricalDQN(
            state_dim,
            action_dim=output_dim,
            num_atoms=self.num_atoms,
            qmin=self.qmin,
            qmax=self.qmax,
            sizes=self.sizes,
            activations=self.activations,
            use_batch_norm=False,
            dropout_ratio=0.0,
            use_gpu=False,
        )
Example #15
    def __init__(
        self,
        q_network,
        q_network_target,
        # Start SlateQTrainerParameters
        rl: rlp.RLParameters = field(  # noqa: B008
            default_factory=lambda: rlp.RLParameters(maxq_learning=False)),
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        single_selection: bool = True,
        minibatch_size: int = 1024,
        evaluation: rlp.EvaluationParameters = field(  # noqa: B008
            default_factory=lambda: rlp.EvaluationParameters(
                calc_cpe_in_training=False)),
    ) -> None:
        """
        Args:
            q_network: states, action -> q-value
            rl (optional): an instance of the RLParameter class, which
                defines relevant hyperparameters
            optimizer (optional): the optimizer class and
                optimizer hyperparameters for the q network(s) optimizer
            single_selection (optional): TBD
            minibatch_size (optional): the size of the minibatch
            evaluation (optional): TBD
        """
        super().__init__()
        self.rl_parameters = rl

        self.single_selection = single_selection

        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer
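A hypothetical wiring sketch (not from the source): slate_q_network stands in for an already-built Q-network, the target network is a deep copy as in the other trainers shown here, and the class name SlateQTrainer is an assumption based on the "Start SlateQTrainerParameters" marker.

import copy

# Hypothetical usage; slate_q_network and the SlateQTrainer name are assumptions.
slate_q_target = copy.deepcopy(slate_q_network)
trainer = SlateQTrainer(
    q_network=slate_q_network,
    q_network_target=slate_q_target,
    single_selection=True,
    minibatch_size=1024,
)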
Example #16
class DQNTrainerParameters:
    __hash__ = rlp.param_hash

    actions: List[str] = field(default_factory=list)
    rl: rlp.RLParameters = field(default_factory=rlp.RLParameters)
    double_q_learning: bool = True
    bcq: Optional[BCQConfig] = None
    minibatch_size: int = 1024
    minibatches_per_step: int = 1
    optimizer: rlp.OptimizerParameters = field(
        default_factory=rlp.OptimizerParameters)
    evaluation: rlp.EvaluationParameters = field(
        default_factory=rlp.EvaluationParameters)

    @classmethod
    def from_discrete_action_model_parameters(
            cls, params: DiscreteActionModelParameters):
        return cls(
            actions=params.actions,
            rl=params.rl,
            double_q_learning=params.rainbow.double_q_learning,
            bcq=BCQConfig(drop_threshold=params.rainbow.bcq_drop_threshold)
            if params.rainbow.bcq else None,
            minibatch_size=params.training.minibatch_size,
            minibatches_per_step=params.training.minibatches_per_step,
            optimizer=rlp.OptimizerParameters(
                optimizer=params.training.optimizer,
                learning_rate=params.training.learning_rate,
                l2_decay=params.training.l2_decay,
            ),
            evaluation=params.evaluation,
        )
Example #17
class FullyConnected(DiscreteDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    dropout_ratio: float = 0.0
    use_batch_norm: bool = False

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_feature_config: rlt.ModelFeatureConfig,
        state_normalization_data: NormalizationData,
        output_dim: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_data)
        return FullyConnectedDQN(
            state_dim=state_dim,
            action_dim=output_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
            use_batch_norm=self.use_batch_norm,
        )
Example #18
class DirichletFullyConnected(ContinuousActorNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [128, 64])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    use_batch_norm: bool = False

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    @property
    def default_action_preprocessing(self) -> str:
        return DO_NOT_PREPROCESS

    def build_actor(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: NormalizationData,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        action_dim = get_num_output_features(
            action_normalization_data.dense_normalization_parameters)
        return DirichletFullyConnectedActor(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.sizes,
            activations=self.activations,
            use_batch_norm=self.use_batch_norm,
        )
Example #19
class Quantile(QRDQNNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    dropout_ratio: float = 0.0

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_normalization_data: NormalizationData,
        output_dim: int,
        num_atoms: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_data)
        return FullyConnectedDQN(
            state_dim=state_dim,
            action_dim=output_dim,
            sizes=self.sizes,
            num_atoms=num_atoms,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
Example #20
class Seq2RewardModel(WorldModelBase):
    __hash__ = param_hash
    net_builder: ValueNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `Seq2RewardNetBuilder`.
        default_factory=lambda: ValueNetBuilder__Union(
            Seq2RewardNetBuilder=Seq2RewardNetBuilder()
        )
    )

    trainer_param: Seq2RewardTrainerParameters = field(
        default_factory=Seq2RewardTrainerParameters
    )

    def build_trainer(self) -> Seq2RewardTrainer:
        seq2reward_network = self.net_builder.value.build_value_network(
            self.state_normalization_data, self.action_normalization_data
        )

        if self.use_gpu:
            seq2reward_network = seq2reward_network.cuda()

        return Seq2RewardTrainer(
            seq2reward_network=seq2reward_network, params=self.trainer_param
        )

    def build_serving_module(self) -> torch.nn.Module:
        """
        Returns a TorchScript predictor module
        """
        raise NotImplementedError()
Example #21
    def __init__(
        self,
        seq2slate_net: Seq2SlateTransformerNet,
        reward_net_path: str,
        minibatch_size: int,
        parameters: Seq2SlateParameters,
        baseline_net: Optional[BaselineNet] = None,
        baseline_warmup_num_batches: int = 0,
        use_gpu: bool = False,
        policy_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default
        ),
        baseline_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default
        ),
    ) -> None:
        self.reward_net_path = reward_net_path
        # loaded when used
        self.reward_net = None
        self.parameters = parameters
        self.minibatch_size = minibatch_size
        self.use_gpu = use_gpu
        self.device = torch.device("cuda") if use_gpu else torch.device("cpu")
        self.permutation_index = torch.tensor(
            list(
                permutations(
                    # pyre-fixme[6]: Expected `Iterable[Variable[itertools._T]]` for
                    #  1st param but got `Tensor`.
                    torch.arange(seq2slate_net.max_src_seq_len),
                    seq2slate_net.max_tgt_seq_len,
                )
            ),
            device=self.device,
        ).long()

        if self.parameters.simulation_distance_penalty is not None:
            # pyre-fixme[16]: `Optional` has no attribute `__gt__`.
            assert self.parameters.simulation_distance_penalty > 0
            self.permutation_distance = (
                torch.tensor(
                    [swap_dist(x.tolist()) for x in self.permutation_index],
                    device=self.device,
                )
                .unsqueeze(1)
                .float()
            )
            self.MAX_DISTANCE = torch.max(self.permutation_distance)

        self.trainer = Seq2SlateTrainer(
            seq2slate_net,
            minibatch_size,
            self.parameters,
            baseline_net=baseline_net,
            baseline_warmup_num_batches=baseline_warmup_num_batches,
            use_gpu=use_gpu,
            policy_optimizer=policy_optimizer,
            baseline_optimizer=baseline_optimizer,
        )
        self.seq2slate_net = self.trainer.seq2slate_net
        self.baseline_net = self.trainer.baseline_net
Example #22
class Seq2RewardModel(WorldModelBase):
    __hash__ = param_hash
    net_builder: ValueNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `Seq2RewardNetBuilder`.
        default_factory=lambda: ValueNetBuilder__Union(Seq2RewardNetBuilder=
                                                       Seq2RewardNetBuilder()))

    compress_net_builder: ValueNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `FullyConnected`.
        default_factory=lambda: ValueNetBuilder__Union(FullyConnected=
                                                       FullyConnected()))

    trainer_param: Seq2RewardTrainerParameters = field(
        default_factory=Seq2RewardTrainerParameters)

    preprocessing_options: Optional[PreprocessingOptions] = None

    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> Seq2RewardTrainer:
        seq2reward_network = self.net_builder.value.build_value_network(
            normalization_data_map[NormalizationKey.STATE])
        trainer = Seq2RewardTrainer(seq2reward_network=seq2reward_network,
                                    params=self.trainer_param)
        return trainer

    def get_reporter(self) -> Seq2RewardReporter:
        return Seq2RewardReporter(self.trainer_param.action_names)
Example #23
    def __init__(
        self,
        seq2slate_net: Seq2SlateTransformerNet,
        minibatch_size: int = 1024,
        parameters: Seq2SlateParameters = field(  # noqa: B008
            default_factory=Seq2SlateParameters),
        baseline_net: Optional[BaselineNet] = None,
        baseline_warmup_num_batches: int = 0,
        use_gpu: bool = False,
        policy_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        baseline_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        print_interval: int = 100,
    ) -> None:
        self.seq2slate_net = seq2slate_net
        self.parameters = parameters
        self.use_gpu = use_gpu
        self.print_interval = print_interval

        self.minibatch_size = minibatch_size
        self.minibatch = 0

        self.baseline_net = baseline_net
        self.baseline_warmup_num_batches = baseline_warmup_num_batches

        self.rl_opt = policy_optimizer.make_optimizer(
            self.seq2slate_net.parameters())
        if self.baseline_net:
            self.baseline_opt = baseline_optimizer.make_optimizer(
                # pyre-fixme[16]: `Optional` has no attribute `parameters`.
                self.baseline_net.parameters())
Example #24
    def __init__(
        self,
        q_network,
        q_network_target,
        metrics_to_score=None,
        reward_network=None,
        q_network_cpe=None,
        q_network_cpe_target=None,
        loss_reporter=None,
        use_gpu: bool = False,
        actions: List[str] = field(default_factory=list),  # noqa: B008
        rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
        double_q_learning: bool = True,
        num_atoms: int = 51,
        minibatch_size: int = 1024,
        minibatches_per_step: int = 1,
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        cpe_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        evaluation: EvaluationParameters = field(  # noqa: B008
            default_factory=EvaluationParameters),
    ) -> None:
        super().__init__(
            rl,
            use_gpu=use_gpu,
            metrics_to_score=metrics_to_score,
            actions=actions,
            evaluation_parameters=evaluation,
            loss_reporter=loss_reporter,
        )

        self.double_q_learning = double_q_learning
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step
        self._actions = actions

        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer.make_optimizer(
            self.q_network.parameters())

        self.num_atoms = num_atoms
        self.quantiles = (
            (0.5 + torch.arange(self.num_atoms, device=self.device).float()) /
            float(self.num_atoms)).view(1, -1)

        self._initialize_cpe(reward_network,
                             q_network_cpe,
                             q_network_cpe_target,
                             optimizer=cpe_optimizer)

        self.reward_boosts = torch.zeros([1, len(self._actions)],
                                         device=self.device)
        if rl.reward_boost is not None:
            # pyre-fixme[16]: Optional type has no attribute `keys`.
            for k in rl.reward_boost.keys():
                i = self._actions.index(k)
                # pyre-fixme[16]: Optional type has no attribute `__getitem__`.
                self.reward_boosts[0, i] = rl.reward_boost[k]
Example #25
class C51TrainerParameters:
    __hash__ = rlp.param_hash

    actions: List[str] = field(default_factory=list)
    rl: rlp.RLParameters = field(default_factory=rlp.RLParameters)
    double_q_learning: bool = True
    minibatch_size: int = 1024
    minibatches_per_step: int = 1
    num_atoms: int = 51
    qmin: float = -100
    qmax: float = 200
    optimizer: rlp.OptimizerParameters = field(
        default_factory=rlp.OptimizerParameters)
    evaluation: rlp.EvaluationParameters = field(
        default_factory=rlp.EvaluationParameters)

    @classmethod
    def from_discrete_action_model_parameters(
            cls, params: DiscreteActionModelParameters):
        return cls(
            actions=params.actions,
            rl=params.rl,
            double_q_learning=params.rainbow.double_q_learning,
            minibatch_size=params.training.minibatch_size,
            minibatches_per_step=params.training.minibatches_per_step,
            num_atoms=params.rainbow.num_atoms,
            qmin=params.rainbow.qmin,
            qmax=params.rainbow.qmax,
            optimizer=rlp.OptimizerParameters(
                optimizer=params.training.optimizer,
                learning_rate=params.training.learning_rate,
                l2_decay=params.rainbow.c51_l2_decay,
            ),
            evaluation=params.evaluation,
        )
Example #26
class FullyConnectedWithEmbedding(DiscreteDQNWithIdListNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    embedding_dim: int = 64
    dropout_ratio: float = 0.0

    def __post_init_post_parse__(self):
        super().__init__()
        assert len(self.sizes) == len(self.activations), (
            f"Must have the same numbers of sizes and activations; got: "
            f"{self.sizes}, {self.activations}")

    def build_q_network(
        self,
        state_feature_config: rlt.ModelFeatureConfig,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        output_dim: int,
    ) -> ModelBase:
        state_dim = self._get_input_dim(state_normalization_parameters)
        return FullyConnectedDQNWithEmbedding(
            state_dim=state_dim,
            action_dim=output_dim,
            sizes=self.sizes,
            activations=self.activations,
            model_feature_config=state_feature_config,
            embedding_dim=self.embedding_dim,
            dropout_ratio=self.dropout_ratio,
        )
Example #27
    def __init__(
        self,
        seq2slate_net: Seq2SlateTransformerNet,
        params: Seq2SlateParameters = field(  # noqa: B008
            default_factory=Seq2SlateParameters),
        baseline_net: Optional[BaselineNet] = None,
        baseline_warmup_num_batches: int = 0,
        policy_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        baseline_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        policy_gradient_interval: int = 1,
        print_interval: int = 100,
        calc_cpe: bool = False,
        reward_network: Optional[nn.Module] = None,
    ) -> None:
        super().__init__(
            seq2slate_net,
            params=params,
            baseline_net=baseline_net,
            baseline_warmup_num_batches=baseline_warmup_num_batches,
            policy_optimizer=policy_optimizer,
            baseline_optimizer=baseline_optimizer,
            policy_gradient_interval=policy_gradient_interval,
            print_interval=print_interval,
            calc_cpe=calc_cpe,
            reward_network=reward_network,
        )
        self.sim_param = params.simulation
        assert self.sim_param is not None
        # loaded when used
        self.reward_name_and_net = nn.ModuleDict({})
        self.MAX_DISTANCE = (seq2slate_net.max_src_seq_len *
                             (seq2slate_net.max_src_seq_len - 1) / 2)
Example #28
class ParametricDQN(ParametricDQNBase):
    __hash__ = param_hash

    trainer_param: ParametricDQNTrainerParameters = field(
        default_factory=ParametricDQNTrainerParameters)
    net_builder: ParametricDQNNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `FullyConnected`.
        default_factory=lambda: ParametricDQNNetBuilder__Union(
            FullyConnected=FullyConnected()))

    @property
    def rl_parameters(self):
        return self.trainer_param.rl

    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> ParametricDQNTrainer:
        net_builder = self.net_builder.value
        # pyre-fixme[16]: `ParametricDQN` has no attribute `_q_network`.
        self._q_network = net_builder.build_q_network(
            normalization_data_map[NormalizationKey.STATE],
            normalization_data_map[NormalizationKey.ACTION],
        )
        # Metrics + reward
        reward_options = reward_options or RewardOptions()
        metrics_to_score = get_metrics_to_score(
            reward_options.metric_reward_values)
        reward_output_dim = len(metrics_to_score) + 1
        reward_network = net_builder.build_q_network(
            normalization_data_map[NormalizationKey.STATE],
            normalization_data_map[NormalizationKey.ACTION],
            output_dim=reward_output_dim,
        )

        q_network_target = self._q_network.get_target_network()
        return ParametricDQNTrainer(
            q_network=self._q_network,
            q_network_target=q_network_target,
            reward_network=reward_network,
            # pyre-fixme[16]: `ParametricDQNTrainerParameters` has no attribute
            #  `asdict`.
            **self.trainer_param.asdict(),
        )

    def build_serving_module(
        self,
        trainer_module: ReAgentLightningModule,
        normalization_data_map: Dict[str, NormalizationData],
    ) -> torch.nn.Module:
        assert isinstance(trainer_module, ParametricDQNTrainer)
        net_builder = self.net_builder.value
        return net_builder.build_serving_module(
            trainer_module.q_network,
            normalization_data_map[NormalizationKey.STATE],
            normalization_data_map[NormalizationKey.ACTION],
        )
Example #29
class NGramConvNetSyntheticReward(SyntheticRewardNetBuilder):
    __hash__ = param_hash

    sizes: List[int] = field(default_factory=lambda: [256, 128])
    activations: List[str] = field(default_factory=lambda: ["relu", "relu"])
    last_layer_activation: str = "sigmoid"
    context_size: int = 3
    conv_net_params: ConvNetParameters = field(
        default_factory=lambda: ConvNetParameters(
            conv_dims=[256, 128],
            conv_height_kernels=[1, 1],
            pool_types=["max", "max"],
            pool_kernel_sizes=[1, 1],
        ))

    def build_synthetic_reward_network(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: Optional[NormalizationData] = None,
        discrete_action_names: Optional[List[str]] = None,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)

        if not discrete_action_names:
            assert action_normalization_data is not None
            action_dim = get_num_output_features(
                action_normalization_data.dense_normalization_parameters)
        else:
            action_dim = len(discrete_action_names)

        conv_net = synthetic_reward.NGramConvolutionalNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.sizes,
            activations=self.activations,
            last_layer_activation=self.last_layer_activation,
            context_size=self.context_size,
            conv_net_params=self.conv_net_params,
        )
        return NGramSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            context_size=self.context_size,
            net=conv_net,
        )

    def build_serving_module(
        self,
        synthetic_reward_network: ModelBase,
        state_normalization_data: NormalizationData,
        action_normalization_data: Optional[NormalizationData] = None,
        discrete_action_names: Optional[List[str]] = None,
    ) -> torch.nn.Module:
        """
        Returns a TorchScript predictor module
        """
        raise NotImplementedError(
            "N-gram Synthetic Reward Predictor has not been implemented")
Example #30
class DiscreteC51DQN(DiscreteDQNBase):
    __hash__ = param_hash

    trainer_param: C51TrainerParameters = field(default_factory=C51TrainerParameters)
    net_builder: CategoricalDQNNetBuilder__Union = field(
        default_factory=lambda: CategoricalDQNNetBuilder__Union(
            Categorical=Categorical()
        )
    )
    cpe_net_builder: CategoricalDQNNetBuilder__Union = field(
        default_factory=lambda: CategoricalDQNNetBuilder__Union(
            Categorical=Categorical()
        )
    )

    def __post_init_post_parse__(self):
        super().__post_init_post_parse__()
        self.rl_parameters = self.trainer_param.rl
        self.eval_parameters = self.trainer_param.evaluation
        self.action_names = self.trainer_param.actions
        assert len(self.action_names) > 1, "DiscreteC51DQN needs at least 2 actions"
        assert (
            self.trainer_param.minibatch_size % 8 == 0
        ), "The minibatch size must be divisible by 8 for performance reasons."

    def build_trainer(self) -> C51Trainer:
        net_builder = self.net_builder.value
        q_network = net_builder.build_q_network(
            self.state_normalization_parameters, len(self.action_names)
        )

        if self.use_gpu:
            q_network = q_network.cuda()

        q_network_target = q_network.get_target_network()

        self._q_network = q_network

        return C51Trainer(
            q_network,
            q_network_target,
            self.trainer_param,
            self.use_gpu,
            metrics_to_score=self.metrics_to_score,
            loss_reporter=NoOpLossReporter(),
        )

    def build_serving_module(self) -> torch.nn.Module:
        """
        Returns a TorchScript predictor module
        """
        assert self._q_network is not None, "_q_network was not initialized"
        net_builder = self.net_builder.value
        return net_builder.build_serving_module(
            self._q_network,
            self.state_normalization_parameters,
            action_names=self.action_names,
            state_feature_config=self.state_feature_config,
        )