Example #1
    def metrics_to_score(self) -> List[str]:
        assert self.reward_options is not None
        if self._metrics_to_score is None:
            # pyre-fixme[16]: `ParametricDQNBase` has no attribute `_metrics_to_score`.
            self._metrics_to_score = get_metrics_to_score(
                self._reward_options.metric_reward_values
            )
        return self._metrics_to_score
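For readers unfamiliar with the helper, here is a minimal, self-contained sketch of the compute-once caching pattern these properties follow. get_metrics_to_score_sketch and MetricsCache are hypothetical stand-ins for illustration only, not ReAgent's implementation; the assumption is that get_metrics_to_score derives a list of metric names from the metric_reward_values mapping.

from typing import Dict, List, Optional


def get_metrics_to_score_sketch(
    metric_reward_values: Optional[Dict[str, float]],
) -> List[str]:
    # Assumed behavior: return the metric names found in the
    # metric -> reward-weight mapping; an unset mapping yields no metrics.
    if not metric_reward_values:
        return []
    return sorted(metric_reward_values.keys())


class MetricsCache:
    """Illustrates the compute-once caching used by metrics_to_score above."""

    def __init__(self, metric_reward_values: Optional[Dict[str, float]] = None):
        self._metric_reward_values = metric_reward_values
        self._metrics_to_score: Optional[List[str]] = None

    @property
    def metrics_to_score(self) -> List[str]:
        # First access computes the list; later accesses reuse the cached value.
        if self._metrics_to_score is None:
            self._metrics_to_score = get_metrics_to_score_sketch(
                self._metric_reward_values
            )
        return self._metrics_to_score


cache = MetricsCache({"clicks": 1.0, "watch_time": 0.5})
assert cache.metrics_to_score == ["clicks", "watch_time"]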
Example #2
    def setUp(self):
        # preparing various components for PPO trainer initialization
        self.batch_size = 3
        self.state_dim = 10
        self.action_dim = 2
        self.num_layers = 2
        self.sizes = [20 for _ in range(self.num_layers)]
        self.activations = ["relu" for _ in range(self.num_layers)]
        self.use_layer_norm = False
        self.softmax_temperature = 1

        self.actions = [str(i) for i in range(self.action_dim)]
        self.params = PPOTrainerParameters(actions=self.actions, normalize=False)
        self.reward_options = RewardOptions()
        self.metrics_to_score = get_metrics_to_score(
            self.reward_options.metric_reward_values
        )

        self.policy_network = DuelingQNetwork.make_fully_connected(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            layers=self.sizes,
            activations=self.activations,
        )
        self.sampler = SoftmaxActionSampler(temperature=self.softmax_temperature)
        self.policy = Policy(scorer=self.policy_network, sampler=self.sampler)

        self.value_network = FloatFeatureFullyConnected(
            state_dim=self.state_dim,
            output_dim=1,
            sizes=self.sizes,
            activations=self.activations,
            use_layer_norm=self.use_layer_norm,
        )
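The Policy above pairs a Q-network scorer with a SoftmaxActionSampler. As a rough illustration of temperature-scaled softmax sampling, here is a generic sketch (not ReAgent's SoftmaxActionSampler):

import torch


def sample_actions(scores: torch.Tensor, temperature: float = 1.0) -> torch.Tensor:
    # Higher temperature flattens the distribution; lower temperature
    # concentrates probability on the highest-scoring action.
    probs = torch.softmax(scores / temperature, dim=-1)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)


scores = torch.tensor([[1.0, 2.0], [0.5, 0.1]])  # assumed Q-values for 2 actions
actions = sample_actions(scores, temperature=1.0)
assert actions.shape == (2,)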
Example #3
    def metrics_to_score(self) -> List[str]:
        assert self.reward_options is not None
        if self._metrics_to_score is None:
            self._metrics_to_score = get_metrics_to_score(
                self._reward_options.metric_reward_values
            )
        return self._metrics_to_score
Example #4
    def metrics_to_score(self) -> List[str]:
        assert self._reward_options is not None
        # pyre-fixme[16]: `ActorCriticBase` has no attribute `_metrics_to_score`.
        if self._metrics_to_score is None:
            self._metrics_to_score = get_metrics_to_score(
                self._reward_options.metric_reward_values
            )
        return self._metrics_to_score
Example #5
    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> ParametricDQNTrainer:
        net_builder = self.net_builder.value
        # pyre-fixme[16]: `ParametricDQN` has no attribute `_q_network`.
        self._q_network = net_builder.build_q_network(
            normalization_data_map[NormalizationKey.STATE],
            normalization_data_map[NormalizationKey.ACTION],
        )
        # Metrics + reward
        reward_options = reward_options or RewardOptions()
        metrics_to_score = get_metrics_to_score(
            reward_options.metric_reward_values)
        reward_output_dim = len(metrics_to_score) + 1
        reward_network = net_builder.build_q_network(
            normalization_data_map[NormalizationKey.STATE],
            normalization_data_map[NormalizationKey.ACTION],
            output_dim=reward_output_dim,
        )

        q_network_target = self._q_network.get_target_network()
        return ParametricDQNTrainer(
            q_network=self._q_network,
            q_network_target=q_network_target,
            reward_network=reward_network,
            # pyre-fixme[16]: `ParametricDQNTrainerParameters` has no attribute
            #  `asdict`.
            **self.trainer_param.asdict(),
        )
Example #6
    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> QRDQNTrainer:
        net_builder = self.net_builder.value
        q_network = net_builder.build_q_network(
            normalization_data_map[NormalizationKey.STATE],
            len(self.action_names),
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `num_atoms`.
            num_atoms=self.trainer_param.num_atoms,
        )

        q_network_target = q_network.get_target_network()

        reward_options = reward_options or RewardOptions()
        metrics_to_score = get_metrics_to_score(
            reward_options.metric_reward_values)

        reward_network, q_network_cpe, q_network_cpe_target = None, None, None
        if self.eval_parameters.calc_cpe_in_training:
            # Metrics + reward
            num_output_nodes = (len(metrics_to_score) + 1) * len(
                # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
                self.trainer_param.actions)

            cpe_net_builder = self.cpe_net_builder.value
            reward_network = cpe_net_builder.build_q_network(
                self.state_feature_config,
                normalization_data_map[NormalizationKey.STATE],
                num_output_nodes,
            )
            q_network_cpe = cpe_net_builder.build_q_network(
                self.state_feature_config,
                normalization_data_map[NormalizationKey.STATE],
                num_output_nodes,
            )

            q_network_cpe_target = q_network_cpe.get_target_network()

        trainer = QRDQNTrainer(
            q_network=q_network,
            q_network_target=q_network_target,
            reward_network=reward_network,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
            evaluation=self.eval_parameters,
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `asdict`.
            **self.trainer_param.asdict(),
        )
        return trainer
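Both build_trainer examples size their reward/CPE heads from the metric list: one output per scored metric plus one for the reward itself, and in the discrete case that block is repeated per action. A small worked example with made-up metric and action names:

# Hypothetical values, chosen only to make the arithmetic concrete.
metrics_to_score = ["metric_a", "metric_b"]
actions = ["0", "1"]

# Parametric case (Example #5): metrics + reward.
reward_output_dim = len(metrics_to_score) + 1
assert reward_output_dim == 3

# Discrete case (Example #6): (metrics + reward) replicated per action.
num_output_nodes = (len(metrics_to_score) + 1) * len(actions)
assert num_output_nodes == 6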
Example #7
    def setUp(self):
        # preparing various components for qr-dqn trainer initialization
        self.params = QRDQNTrainerParameters(actions=["1", "2"], num_atoms=11)
        self.reward_options = RewardOptions()
        self.metrics_to_score = get_metrics_to_score(
            self.reward_options.metric_reward_values
        )
        self.state_dim = 10
        self.action_dim = 2
        self.sizes = [20, 20]
        self.num_atoms = 11
        self.activations = ["relu", "relu"]
        self.dropout_ratio = 0
        self.q_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            num_atoms=self.num_atoms,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q_network_target = self.q_network.get_target_network()
        self.x = FeatureData(float_features=torch.rand(5, 10))
        self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
        self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
            self.params.actions
        )
        self.reward_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe_target = self.q_network_cpe.get_target_network()
Example #8
    def setUp(self):
        # preparing various components for CRR trainer initialization
        self.batch_size = 3
        self.state_dim = 10
        self.action_dim = 2
        self.num_layers = 2
        self.sizes = [20 for _ in range(self.num_layers)]
        self.num_atoms = 11
        self.activations = ["relu" for _ in range(self.num_layers)]
        self.dropout_ratio = 0
        self.exploration_variance = 1e-10

        self.actions = [str(i) for i in range(self.action_dim)]
        self.params = CRRTrainerParameters(actions=self.actions)
        self.reward_options = RewardOptions()
        self.metrics_to_score = get_metrics_to_score(
            self.reward_options.metric_reward_values
        )

        self.actor_network = FullyConnectedActor(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            exploration_variance=self.exploration_variance,
        )
        self.actor_network_target = self.actor_network.get_target_network()

        self.q1_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q1_network_target = self.q1_network.get_target_network()

        self.q2_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            sizes=self.sizes,
            activations=self.activations,
            dropout_ratio=self.dropout_ratio,
        )
        self.q2_network_target = self.q2_network.get_target_network()

        self.num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            self.params.actions
        )
        self.eval_parameters = EvaluationParameters(calc_cpe_in_training=True)
        self.reward_network = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe = FullyConnectedDQN(
            state_dim=self.state_dim,
            action_dim=self.num_output_nodes,
            sizes=self.sizes,
            activations=self.activations,
        )
        self.q_network_cpe_target = self.q_network_cpe.get_target_network()
        self.inp = DiscreteDqnInput(
            state=FeatureData(
                float_features=torch.rand(self.batch_size, self.state_dim)
            ),
            next_state=FeatureData(
                float_features=torch.rand(self.batch_size, self.state_dim)
            ),
            reward=torch.ones(self.batch_size, 1),
            time_diff=torch.ones(self.batch_size, 1) * 2,
            step=torch.ones(self.batch_size, 1) * 2,
            not_terminal=torch.ones(
                self.batch_size, 1
            ),  # todo: check terminal behavior
            action=torch.tensor([[0, 1], [1, 0], [0, 1]]),
            next_action=torch.tensor([[1, 0], [0, 1], [1, 0]]),
            possible_actions_mask=torch.ones(self.batch_size, self.action_dim),
            possible_next_actions_mask=torch.ones(self.batch_size, self.action_dim),
            extras=ExtraData(action_probability=torch.ones(self.batch_size, 1)),
        )
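The action and next_action fields of the DiscreteDqnInput above are one-hot rows over action_dim. An equivalent construction with torch.nn.functional.one_hot (the index values are simply read off the literals above):

import torch
import torch.nn.functional as F

batch_size, action_dim = 3, 2
action_indices = torch.tensor([1, 0, 1])
next_action_indices = torch.tensor([0, 1, 0])

action = F.one_hot(action_indices, num_classes=action_dim)
next_action = F.one_hot(next_action_indices, num_classes=action_dim)
assert torch.equal(action, torch.tensor([[0, 1], [1, 0], [0, 1]]))

# An all-ones mask means every action is available in every state.
possible_actions_mask = torch.ones(batch_size, action_dim)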
Example #9
    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> DiscreteCRRTrainer:
        actor_net_builder = self.actor_net_builder.value
        actor_network = actor_net_builder.build_actor(
            normalization_data_map[NormalizationKey.STATE],
            len(self.action_names))
        actor_network_target = actor_network.get_target_network()

        # The arguments to q_network1 and q_network2 below are modeled after those in discrete_dqn.py
        critic_net_builder = self.critic_net_builder.value

        q1_network = critic_net_builder.build_q_network(
            self.state_feature_config,
            normalization_data_map[NormalizationKey.STATE],
            len(self.action_names),
        )
        q1_network_target = q1_network.get_target_network()

        q2_network = q2_network_target = None
        # pyre-fixme[16]: `CRRTrainerParameters` has no attribute
        #  `double_q_learning`.
        if self.trainer_param.double_q_learning:
            q2_network = critic_net_builder.build_q_network(
                self.state_feature_config,
                normalization_data_map[NormalizationKey.STATE],
                len(self.action_names),
            )
            q2_network_target = q2_network.get_target_network()

        reward_options = reward_options or RewardOptions()
        metrics_to_score = get_metrics_to_score(
            reward_options.metric_reward_values)

        reward_network, q_network_cpe, q_network_cpe_target = None, None, None
        if self.eval_parameters.calc_cpe_in_training:
            # Metrics + reward
            num_output_nodes = (len(metrics_to_score) + 1) * len(
                # pyre-fixme[16]: `CRRTrainerParameters` has no attribute `actions`.
                self.trainer_param.actions)

            cpe_net_builder = self.cpe_net_builder.value
            reward_network = cpe_net_builder.build_q_network(
                self.state_feature_config,
                normalization_data_map[NormalizationKey.STATE],
                num_output_nodes,
            )
            q_network_cpe = cpe_net_builder.build_q_network(
                self.state_feature_config,
                normalization_data_map[NormalizationKey.STATE],
                num_output_nodes,
            )

            q_network_cpe_target = q_network_cpe.get_target_network()

        trainer = DiscreteCRRTrainer(
            actor_network=actor_network,
            actor_network_target=actor_network_target,
            q1_network=q1_network,
            q1_network_target=q1_network_target,
            reward_network=reward_network,
            q2_network=q2_network,
            q2_network_target=q2_network_target,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
            evaluation=self.eval_parameters,
            # pyre-fixme[16]: `CRRTrainerParameters` has no attribute `asdict`.
            **self.trainer_param.asdict(),
        )
        return trainer
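The builders above all call get_target_network to create a target copy of each network. A generic sketch of the common target-network idiom (a deep copy plus a slow soft update; the function names and tau value are illustrative, not ReAgent's API):

import copy
import torch


def make_target_network(online: torch.nn.Module) -> torch.nn.Module:
    # The target starts as an exact copy and is never trained directly.
    target = copy.deepcopy(online)
    for p in target.parameters():
        p.requires_grad_(False)
    return target


def soft_update(online: torch.nn.Module, target: torch.nn.Module, tau: float = 0.005) -> None:
    # Slowly track the online weights: target <- (1 - tau) * target + tau * online.
    with torch.no_grad():
        for p, tp in zip(online.parameters(), target.parameters()):
            tp.mul_(1.0 - tau).add_(p, alpha=tau)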