Ejemplo n.º 1
0
    def process(
            self, sparse_data: StackedAssociativeArray
    ) -> Tuple[str, str, List[str]]:
        """
        Densify a sparse batch with a SparseToDenseMask op on the current net.

        The fill scalar used for absent features is 0.0 when
        ``self.set_missing_value_to_zero`` is set and ``MISSING_VALUE``
        otherwise. It is both fed into the workspace and emitted as a
        GivenTensorFill op on the current net.

        Args:
            sparse_data: Object exposing ``lengths``, ``keys`` and ``values``
                blobs.

        Returns:
            A tuple ``(dense_input, dense_input_presence, parameters)`` where
            ``parameters`` lists the parameter blobs created here (only the
            fill scalar).

        Raises:
            AssertionError: If ``self.sorted_features`` is empty.
        """
        lengths, keys, values = (
            sparse_data.lengths,
            sparse_data.keys,
            sparse_data.values,
        )

        # Scalar substituted for every feature absent from a row.
        fill_blob = C2.NextBlob("MISSING_SCALAR")
        zero_fill = self.set_missing_value_to_zero
        if zero_fill:
            fill_value = 0.0
        else:
            fill_value = MISSING_VALUE
        workspace.FeedBlob(
            fill_blob, np.array([fill_value], dtype=np.float32))
        C2.net().GivenTensorFill(
            [], [fill_blob], shape=[], values=[fill_value])

        assert len(self.sorted_features) > 0, "Sorted features is empty"

        dense_blob = C2.NextBlob("dense_input")
        presence_blob = C2.NextBlob("dense_input_presence")
        C2.net().SparseToDenseMask(
            [keys, values, fill_blob, lengths],
            [dense_blob, presence_blob],
            mask=self.sorted_features,
            return_presence_mask=True,
        )

        if zero_fill:
            # The op's presence output is discarded and replaced by a mask
            # that is true where the dense value lies strictly within
            # (-1e-4, 1e-4).
            # NOTE(review): this is true for near-zero (i.e. filled) entries,
            # which reads like an "is missing" mask rather than "is present"
            # -- confirm against the consumers of this output.
            above_lower = C2.GT(dense_blob, -1e-4, broadcast=1)
            below_upper = C2.LT(dense_blob, 1e-4, broadcast=1)
            presence_blob = C2.And(above_lower, below_upper)

        return dense_blob, presence_blob, [fill_blob]
Ejemplo n.º 2
0
    def preprocess_blob(self, blob, normalization_parameters):
        """
        Takes in a blob and its normalization parameters. Outputs a tuple
        whose first element is a blob containing the normalized input blob
        and whose second element contains all the parameter blobs used to
        create it.

        Call this from a CPU context and ensure the input blob exists in it.

        All entries of ``normalization_parameters`` must share one feature
        type, which selects the transformation:

        * BINARY: 1.0 where the value is more than 1e-3 away from zero,
          0.0 otherwise.
        * PROBABILITY: clip to [0.01, 0.99], then apply Logit.
        * ENUM: batch one-hot encoding; the extra column reserved for
          MISSING_VALUE is dropped. This branch returns early, so the
          missing-value zeroing described below is not applied to it.
        * QUANTILE: C2.Percentile over the (boundary, label) pairs built
          from each feature's ``quantiles``.
        * CONTINUOUS / BOXCOX: optional BatchBoxCox (BOXCOX only), then
          subtract the mean and divide by the stddev; clipped to
          [-3.0, 3.0] when ``self.clip_anomalies`` is set.

        For every branch except ENUM, positions where the *input* blob is
        strictly within 1e-4 of MISSING_VALUE are replaced by 0 in the
        output.

        Args:
            blob: Input blob to normalize.
            normalization_parameters: Non-empty indexable sequence of
                per-feature normalization parameter objects.

        Returns:
            ``(output_blob, parameters)`` where ``parameters`` is the list
            of parameter blobs registered through ``_store_parameter``.

        Raises:
            Exception: If the feature types are mixed, or an ENUM feature
                has a negative possible value.
            NotImplementedError: If the feature type is not handled.
            AssertionError: If a BOXCOX feature lacks ``boxcox_shift`` or
                ``boxcox_lambda``.
        """

        parameters: List[str] = []

        ZERO = self._store_parameter(parameters, "ZERO",
                                     np.array([0], dtype=np.float32))

        # Open interval around MISSING_VALUE used to detect missing entries.
        MISSING_U = self._store_parameter(
            parameters, "MISSING_U",
            np.array([MISSING_VALUE + 1e-4], dtype=np.float32))
        MISSING_L = self._store_parameter(
            parameters, "MISSING_L",
            np.array([MISSING_VALUE - 1e-4], dtype=np.float32))

        # The missing mask is computed on the raw input, before `blob` is
        # rebound to its transformed version below.
        is_empty_l = C2.GT(blob, MISSING_L, broadcast=1)
        is_empty_u = C2.LT(blob, MISSING_U, broadcast=1)
        is_empty = C2.And(is_empty_l, is_empty_u)

        feature_type = normalization_parameters[0].feature_type
        if any(p.feature_type != feature_type
               for p in normalization_parameters):
            raise Exception(
                "Only one feature type is allowed per call to preprocess_blob!"
            )

        if feature_type == identify_types.BINARY:
            TOLERANCE = self._store_parameter(parameters, "TOLERANCE",
                                              np.array(1e-3, dtype=np.float32))
            is_gt_zero = C2.GT(blob,
                               C2.Add(ZERO, TOLERANCE, broadcast=1),
                               broadcast=1)
            is_lt_zero = C2.LT(blob,
                               C2.Sub(ZERO, TOLERANCE, broadcast=1),
                               broadcast=1)
            bool_blob = C2.Or(is_gt_zero, is_lt_zero)
            blob = C2.Cast(bool_blob, to=caffe2_pb2.TensorProto.FLOAT)
        elif feature_type == identify_types.PROBABILITY:
            blob = C2.Logit(C2.Clip(blob, min=0.01, max=0.99))
        elif feature_type == identify_types.ENUM:
            for parameter in normalization_parameters:
                possible_values = parameter.possible_values
                for x in possible_values:
                    if x < 0:
                        logger.fatal(
                            "Invalid enum possible value for feature: " +
                            str(x) + " " + str(possible_values))
                        # str(blob): do not rely on the blob type
                        # supporting concatenation with str.
                        raise Exception(
                            "Invalid enum possible value for feature " +
                            str(blob) + ": " + str(x) + " " +
                            str(possible_values))

            int_blob = C2.Cast(blob, to=core.DataType.INT32)

            # Batch one hot transform with MISSING_VALUE as a possible value
            feature_lengths = [
                len(p.possible_values) + 1 for p in normalization_parameters
            ]
            feature_lengths_blob = self._store_parameter(
                parameters,
                "feature_lengths_blob",
                np.array(feature_lengths, dtype=np.int32),
            )

            feature_values = [
                x for p in normalization_parameters
                for x in p.possible_values + [int(MISSING_VALUE)]
            ]
            feature_values_blob = self._store_parameter(
                parameters,
                "feature_values_blob",
                np.array(feature_values, dtype=np.int32),
            )

            one_hot_output = C2.BatchOneHot(int_blob, feature_lengths_blob,
                                            feature_values_blob)
            flattened_one_hot = C2.FlattenToVec(one_hot_output)

            # Remove missing values with a mask: per feature, keep every
            # real-value column (1) and drop the trailing MISSING_VALUE
            # column (0).
            cols_to_include = [
                flag for p in normalization_parameters
                for flag in [1] * len(p.possible_values) + [0]
            ]
            mask = self._store_parameter(
                parameters, "mask", np.array(cols_to_include, dtype=np.int32))

            # Broadcast the per-column mask across the one-hot output by
            # adding it to an all-zero tensor of the same shape.
            zero_vec = C2.ConstantFill(one_hot_output,
                                       value=0,
                                       dtype=caffe2_pb2.TensorProto.INT32)

            repeated_mask_bool = C2.Cast(C2.Add(zero_vec, mask, broadcast=1),
                                         to=core.DataType.BOOL)

            flattened_repeated_mask = C2.FlattenToVec(repeated_mask_bool)

            flattened_one_hot_proc = C2.NextBlob("flattened_one_hot_proc")
            flattened_one_hot_proc_indices = C2.NextBlob(
                "flattened_one_hot_proc_indices")
            C2.net().BooleanMask(
                [flattened_one_hot, flattened_repeated_mask],
                [flattened_one_hot_proc, flattened_one_hot_proc_indices],
            )

            one_hot_shape = C2.Shape(one_hot_output)

            # One column per feature was dropped, so the second dimension
            # shrinks by the number of features.
            shape_delta = self._store_parameter(
                parameters,
                "shape_delta",
                np.array([0, len(normalization_parameters)], dtype=np.int64),
            )

            target_shape = C2.Sub(one_hot_shape, shape_delta, broadcast=1)
            output_int_blob = C2.NextBlob("output_int_blob")
            output_int_blob_old_shape = C2.NextBlob(
                "output_int_blob_old_shape")
            C2.net().Reshape(
                [flattened_one_hot_proc, target_shape],
                [output_int_blob, output_int_blob_old_shape],
            )

            output_blob = C2.Cast(output_int_blob, to=core.DataType.FLOAT)

            # Early return: ENUM output skips the missing-value zeroing.
            return output_blob, parameters
        elif feature_type == identify_types.QUANTILE:
            # This transformation replaces a set of values with their quantile.
            # The quantile boundaries are provided in the normalization params.

            quantile_sizes = [
                len(norm.quantiles) for norm in normalization_parameters
            ]
            num_boundaries_blob = self._store_parameter(
                parameters,
                "num_boundaries_blob",
                np.array(quantile_sizes, dtype=np.int32),
            )

            quantile_values = np.array([], dtype=np.float32)
            quantile_labels = np.array([], dtype=np.float32)
            for norm in normalization_parameters:
                quantile_values = np.append(
                    quantile_values, np.array(norm.quantiles,
                                              dtype=np.float32))
                # TODO: Fix this: the np.unique is making this part not true.
                quantile_labels = np.append(
                    quantile_labels,
                    np.arange(len(norm.quantiles), dtype=np.float32) /
                    float(len(norm.quantiles)),
                )
            # Two columns: quantile boundary value, then its label i / n.
            quantiles = np.vstack([quantile_values, quantile_labels]).T
            quantiles_blob = self._store_parameter(parameters,
                                                   "quantiles_blob", quantiles)

            quantile_blob = C2.Percentile(blob, quantiles_blob,
                                          num_boundaries_blob)
            blob = quantile_blob
        elif (feature_type == identify_types.CONTINUOUS
              or feature_type == identify_types.BOXCOX):
            boxcox_shifts = []
            boxcox_lambdas = []
            means = []
            stddevs = []

            for norm in normalization_parameters:
                if feature_type == identify_types.BOXCOX:
                    assert (norm.boxcox_shift is not None
                            and norm.boxcox_lambda is not None)
                    boxcox_shifts.append(norm.boxcox_shift)
                    boxcox_lambdas.append(norm.boxcox_lambda)
                means.append(norm.mean)
                stddevs.append(norm.stddev)

            if feature_type == identify_types.BOXCOX:
                boxcox_shift_blob = self._store_parameter(
                    parameters,
                    "boxcox_shift",
                    np.array(boxcox_shifts, dtype=np.float32),
                )
                # Previously registered under the copy-pasted label
                # "boxcox_shift"; use a label that matches the contents.
                boxcox_lambda_blob = self._store_parameter(
                    parameters,
                    "boxcox_lambda",
                    np.array(boxcox_lambdas, dtype=np.float32),
                )

                blob = C2.BatchBoxCox(blob, boxcox_lambda_blob,
                                      boxcox_shift_blob)

            means_blob = self._store_parameter(
                parameters, "means_blob", np.array([means], dtype=np.float32))
            stddevs_blob = self._store_parameter(
                parameters, "stddevs_blob",
                np.array([stddevs], dtype=np.float32))

            blob = C2.Sub(blob, means_blob, broadcast=1, axis=0)
            blob = C2.Div(blob, stddevs_blob, broadcast=1, axis=0)
            if self.clip_anomalies:
                blob = C2.Clip(blob, min=-3.0, max=3.0)
        else:
            raise NotImplementedError(
                "Invalid feature type: {}".format(feature_type))

        # Zero out every position that was missing in the original input.
        zeros = C2.ConstantFill(blob, value=0.)
        output_blob = C2.Where(is_empty, zeros, blob)

        return output_blob, parameters