Example #1
    def _create_rl_train_net(self) -> None:
        self.rl_train_model = ModelHelper(name="rl_train_" + self.model_id)
        C2.set_model(self.rl_train_model)

        if self.reward_shape is not None:
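            # Reward shaping: add a per-action boost to the logged rewards before computing targets.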
            for action_index, boost in self.reward_shape.items():
                action_boost = C2.Mul(
                    C2.Slice("actions",
                             starts=[0, action_index],
                             ends=[-1, action_index + 1]),
                    boost,
                    broadcast=1,
                )
                C2.net().Sum(["rewards", action_boost], ["rewards"])

        if self.maxq_learning:
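            # Q-learning: bootstrap from the maximum Q-value over the possible next actions.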
            next_q_values = self.get_max_q_values(
                "next_states", self.get_possible_next_actions(), True)
        else:
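            # SARSA-style update: bootstrap from the Q-value of the logged next action.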
            next_q_values = self.get_q_values("next_states", "next_actions",
                                              True)

        discount_blob = C2.ConstantFill("time_diff",
                                        value=self.rl_discount_rate)
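        # Adjust the discount for the elapsed time between steps: gamma ** time_diff.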
        time_diff_adjusted_discount_blob = C2.Pow(
            discount_blob, C2.Cast("time_diff",
                                   to=caffe2_pb2.TensorProto.FLOAT))

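        # TD target: rewards + (gamma ** time_diff) * next_q_values, zeroed out for terminal transitions.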
        q_vals_target = C2.Add(
            "rewards",
            C2.Mul(
                C2.Mul(
                    C2.Cast("not_terminals",
                            to=caffe2_pb2.TensorProto.FLOAT),  # type: ignore
                    time_diff_adjusted_discount_blob,
                    broadcast=1,
                ),
                next_q_values,
            ),
        )

        self.update_model("states", "actions", q_vals_target)
        workspace.RunNetOnce(self.rl_train_model.param_init_net)
        workspace.CreateNet(self.rl_train_model.net)
        C2.set_model(None)
Example #2
    def _create_reward_train_net(self) -> None:
        self.reward_train_model = ModelHelper(name="reward_train_" +
                                              self.model_id)
        C2.set_model(self.reward_train_model)
        if self.reward_shape is not None:
            for action_index, boost in self.reward_shape.items():
                action_boost = C2.Mul(
                    C2.Slice("actions",
                             starts=[0, action_index],
                             ends=[-1, action_index + 1]),
                    boost,
                    broadcast=1,
                )
                C2.net().Sum(["rewards", action_boost], ["rewards"])
        self.update_model("states", "actions", "rewards")
        workspace.RunNetOnce(self.reward_train_model.param_init_net)
        workspace.CreateNet(self.reward_train_model.net)
        C2.set_model(None)
Example #3
    def _create_rl_train_net(self) -> None:
        self.rl_train_model = ModelHelper(name="rl_train_" + self.model_id)
        C2.set_model(self.rl_train_model)

        if self.reward_shape is not None:
            for action_index, boost in self.reward_shape.items():
                action_boost = C2.Mul(
                    C2.Slice(
                        'actions',
                        starts=[0, action_index],
                        ends=[-1, action_index + 1],
                    ),
                    boost,
                    broadcast=1,
                )
                C2.net().Sum(['rewards', action_boost], ['rewards'])

        if self.maxq_learning:
            next_q_values = self.get_max_q_values(
                'next_states',
                self.get_possible_next_actions(),
                True,
            )
        else:
            next_q_values = self.get_q_values('next_states', 'next_actions',
                                              True)

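        # TD target with a constant discount: rewards + gamma * next_q_values for non-terminal transitions.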
        q_vals_target = C2.Add(
            'rewards',
            C2.Mul(
                C2.Mul(
                    C2.Cast('not_terminals',
                            to=caffe2_pb2.TensorProto.FLOAT),  # type: ignore
                    self.rl_discount_rate,
                    broadcast=1,
                ),
                next_q_values))

        self.update_model('states', 'actions', q_vals_target)
        workspace.RunNetOnce(self.rl_train_model.param_init_net)
        workspace.CreateNet(self.rl_train_model.net)
        C2.set_model(None)
Example #4
    def _create_reward_train_net(self) -> None:
        self.reward_train_model = ModelHelper(name="reward_train_" +
                                              self.model_id)
        C2.set_model(self.reward_train_model)
        if self.reward_shape is not None:
            for action_index, boost in self.reward_shape.items():
                action_boost = C2.Mul(
                    C2.Slice("actions",
                             starts=[0, action_index],
                             ends=[-1, action_index + 1]),
                    boost,
                    broadcast=1,
                )
                C2.net().Sum(["rewards", action_boost], ["rewards"])
        self.update_model("states", "actions", "rewards")
        workspace.RunNetOnce(self.reward_train_model.param_init_net)
        self.reward_train_model.net.Proto().num_workers = (
            RLTrainer.DEFAULT_TRAINING_NUM_WORKERS)
        self.reward_train_model.net.Proto().type = "async_scheduling"
        workspace.CreateNet(self.reward_train_model.net)
        C2.set_model(None)
Example #5
    def _forward_pass(cls, model, trainer, normalized_dense_matrix, actions):
        C2.set_model(model)

        parameters = []
        q_values = "q_values"
        workspace.FeedBlob(q_values, np.zeros(1, dtype=np.float32))
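        # Build the predictor net; it writes its outputs into the q_values blob.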
        trainer.build_predictor(model, normalized_dense_matrix, q_values)
        parameters.extend(model.GetAllParams())

        action_names = C2.NextBlob("action_names")
        parameters.append(action_names)
        workspace.FeedBlob(action_names, np.array(actions))
        action_range = C2.NextBlob("action_range")
        parameters.append(action_range)
        workspace.FeedBlob(action_range, np.array(list(range(len(actions)))))

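        # Row count and per-row shape of the Q-value matrix, used below to
        # size and tile the output blobs.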
        output_shape = C2.Shape(q_values)
        output_shape_row_count = C2.Slice(output_shape, starts=[0], ends=[1])

        output_row_shape = C2.Slice(q_values, starts=[0, 0], ends=[-1, 1])

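        # Emit the Q-values in the string_weighted_multi_categorical_features
        # output format: action names are the categorical values and the
        # corresponding Q-values are their weights.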
        output_feature_keys = 'output/string_weighted_multi_categorical_features.keys'
        workspace.FeedBlob(output_feature_keys, np.zeros(1, dtype=np.int64))
        output_feature_keys_matrix = C2.ConstantFill(
            output_row_shape, value=0, dtype=caffe2_pb2.TensorProto.INT64)
        # Note: sometimes we need to use an explicit output name, so we call
        #  C2.net().Fn(...)
        C2.net().FlattenToVec(
            [output_feature_keys_matrix],
            [output_feature_keys],
        )

        output_feature_lengths = \
            'output/string_weighted_multi_categorical_features.lengths'
        workspace.FeedBlob(output_feature_lengths, np.zeros(1, dtype=np.int32))
        output_feature_lengths_matrix = C2.ConstantFill(
            output_row_shape, value=1, dtype=caffe2_pb2.TensorProto.INT32)
        C2.net().FlattenToVec(
            [output_feature_lengths_matrix],
            [output_feature_lengths],
        )

        output_keys = 'output/string_weighted_multi_categorical_features.values.keys'
        workspace.FeedBlob(output_keys, np.array(['a']))
        C2.net().Tile([action_names, output_shape_row_count], [output_keys],
                      axis=1)

        output_lengths_matrix = C2.ConstantFill(
            output_row_shape,
            value=len(actions),
            dtype=caffe2_pb2.TensorProto.INT32)
        output_lengths = \
            'output/string_weighted_multi_categorical_features.values.lengths'
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().FlattenToVec(
            [output_lengths_matrix],
            [output_lengths],
        )

        output_values = \
            'output/string_weighted_multi_categorical_features.values.values'
        workspace.FeedBlob(output_values, np.array([1.0]))
        C2.net().FlattenToVec([q_values], [output_values])
        return parameters, q_values