# Example #1
  def testCastToDict(self):
    """Tests tools.cast_to_dict for scalar, dict, and DEFAULT_NAME inputs."""
    names = ['a', 'b', 'c']

    # A scalar value is broadcast to every feature name.
    got = tools.cast_to_dict(1.0, names, 'blah')
    self.assertEqual(got['a'], 1.0)
    self.assertEqual(got['b'], 1.0)
    self.assertEqual(got['c'], 1.0)
    self.assertCountEqual(got.keys(), names)

    # A dict covering all feature names is passed through unchanged.
    got = tools.cast_to_dict({'a': 1.0, 'b': 2.0, 'c': 3.0}, names, 'blah')
    self.assertEqual(got['a'], 1.0)
    self.assertEqual(got['b'], 2.0)
    self.assertEqual(got['c'], 3.0)
    self.assertCountEqual(got.keys(), names)

    # A dict missing a feature (and without a default entry) must raise.
    with self.assertRaisesRegex(
        ValueError,
        'Dict given for blah does not contain definition for feature "c"'):
      got = tools.cast_to_dict({'a': 1.0, 'b': 2.0}, names, 'blah')

    # DEFAULT_NAME supplies the value for features not explicitly listed.
    got = tools.cast_to_dict({'a': 1.0, tools.DEFAULT_NAME: 2.0}, names, 'blah')
    self.assertCountEqual(got.keys(), names)
    self.assertEqual(got['a'], 1.0)
    self.assertEqual(got['b'], 2.0)
    self.assertEqual(got['c'], 2.0)
def input_calibration_layer(columns_to_tensors,
                            num_keypoints,
                            feature_columns=None,
                            keypoints_initializers=None,
                            keypoints_initializer_fns=None,
                            bound=False,
                            monotonic=None,
                            missing_input_values=None,
                            missing_output_values=None,
                            dtype=dtypes.float32,
                            **regularizer_amounts):
    """Creates a calibration layer for the given input and feature_columns.

  Returns a tensor with the calibrated values of the given features, a list
  of the names of the features in the order they appear in the returned
  tensor, and a list of projection ops that must be applied at each step (or
  every so many steps) to project the model to a feasible space -- used for
  bounding the outputs or for imposing monotonicity. The list will be empty
  if neither `bound` nor `monotonic` are set.

  Args:
    columns_to_tensors: A mapping from feature name to tensors. A 'string' key
      means a base (non-transformed) feature. If feature_columns is not set
      these are the features calibrated. Otherwise the transformed
      feature_columns are the ones calibrated.
    num_keypoints: Number of keypoints to use. Either a single int, or a dict
      mapping feature names to num_keypoints. If a value of the dict is 0 or
      None the corresponding feature won't be calibrated.
    feature_columns: Optional. If set to a set of FeatureColumns, these will
      be the features used and calibrated.
    keypoints_initializers: For evaluation or inference (or when resuming
      training from a checkpoint) the values will be loaded from disk, so they
      don't need to be given (leave it as None).
      Either a tuple of two tensors of shape [num_keypoints], or a dict mapping
      feature names to pairs of tensors of shape [num_keypoints[feature_name]].
      See load_keypoints_from_quantiles or uniform_keypoints_for_signal on how
      to generate these (module keypoints_initialization).
    keypoints_initializer_fns: Like keypoints_initializers but using lambda
      initializers. They should be compatible with tf.get_variable. If this is
      set, then keypoints_initializers must be None.
    bound: boolean whether output of calibration must be bound. Alternatively
      a dict mapping feature name to boundedness.
    monotonic: whether calibration must be kept monotonic: None or 0 means no
      monotonicity. Positive or negative values mean increasing or decreasing
      monotonicity respectively. Alternatively a dict mapping feature name
      to monotonicity.
    missing_input_values: If set, and if the input has this value it is assumed
      to be missing and the output will either be calibrated to some value
      between `[calibration_output_min, calibration_output_max]` or set to a
      fixed value set by missing_output_value. Limitation: it only works for
      scalars. Either one value for all inputs, or a dict mapping feature name
      to missing_input_value for the respective feature.
    missing_output_values: Requires missing_input_value also to be set. If set
      it will convert missing input to this value. Either one value for all
      inputs, or a dict mapping feature name to missing_output_value for the
      respective feature.
    dtype: If any of the scalars are not given as tensors, they are converted
      to tensors with this dtype.
    **regularizer_amounts: Keyword args of regularization amounts passed to
      regularizers.calibrator_regularization(). Keyword names should be among
      supported regularizers.CALIBRATOR_REGULARIZERS and values should be
      either float or {feature_name: float}. If float, then the same value is
      applied to all features.

  Returns:
    A tuple of:
    * calibrated tensor of shape [batch_size, sum(features dimensions)].
    * list of the feature names in the order they appear in the calibrated
      tensor. A name may appear more than once if the feature is
      multi-dimensional (for instance a multi-dimensional embedding).
    * list of projection ops, that must be applied at each step (or every so
      many steps) to project the model to a feasible space: used for bounding
      the outputs or for imposing monotonicity. Empty if none are requested.
    * None or tensor with regularization loss.

  Raises:
    ValueError: if dtypes are incompatible, or if an uncalibrated feature has
      rank other than 1 or 2, or if missing-value handling is requested for an
      uncalibrated feature.
  """
    with ops.name_scope('input_calibration_layer'):
        # Normalize every per-feature argument into a dict keyed by feature
        # name, so the per-feature loop below can index them uniformly.
        feature_names = tools.get_sorted_feature_names(columns_to_tensors,
                                                       feature_columns)
        num_keypoints = tools.cast_to_dict(num_keypoints, feature_names,
                                           'num_keypoints')
        bound = tools.cast_to_dict(bound, feature_names, 'bound')
        monotonic = tools.cast_to_dict(monotonic, feature_names, 'monotonic')
        keypoints_initializers = tools.cast_to_dict(keypoints_initializers,
                                                    feature_names,
                                                    'keypoints_initializers')
        keypoints_initializer_fns = tools.cast_to_dict(
            keypoints_initializer_fns, feature_names,
            'keypoints_initializer_fns')
        missing_input_values = tools.cast_to_dict(missing_input_values,
                                                  feature_names,
                                                  'missing_input_values')
        missing_output_values = tools.cast_to_dict(missing_output_values,
                                                   feature_names,
                                                   'missing_output_values')
        # Each regularizer amount may itself be a scalar or a per-feature dict.
        regularizer_amounts = {
            regularizer_name:
            tools.cast_to_dict(regularizer_amounts[regularizer_name],
                               feature_names, regularizer_name)
            for regularizer_name in regularizer_amounts
        }

        per_dimension_feature_names = []

        # Get uncalibrated tensors, either from columns_to_tensors, or using
        # feature_columns.
        if feature_columns is None:
            uncalibrated_features = [
                columns_to_tensors[name] for name in feature_names
            ]
        else:
            transformed_columns_to_tensors = columns_to_tensors.copy()
            dict_feature_columns = {
                f_col.name: f_col
                for f_col in feature_columns
            }
            uncalibrated_features = [
                tools.input_from_feature_column(transformed_columns_to_tensors,
                                                dict_feature_columns[name],
                                                dtype)
                for name in feature_names
            ]

        # Calibrate each feature independently; multi-dimensional features are
        # split into per-dimension scalars first.
        projection_ops = []
        calibrated_splits = []
        total_regularization = None
        for feature_idx in range(len(feature_names)):
            name = feature_names[feature_idx]
            uncalibrated_feature = uncalibrated_features[feature_idx]
            if uncalibrated_feature.shape.ndims == 1:
                # Rank 1: one scalar per example.
                feature_dim = 1
                uncalibrated_splits = [uncalibrated_feature]
            elif uncalibrated_feature.shape.ndims == 2:
                # Rank 2: split [batch, dim] into `dim` rank-1 tensors.
                feature_dim = uncalibrated_feature.shape.dims[1].value
                uncalibrated_splits = array_ops.unstack(uncalibrated_feature,
                                                        axis=1)
            else:
                raise ValueError(
                    'feature {}: it has rank {}, but only ranks 1 or 2 are '
                    'supported; feature shape={}'.format(
                        name, uncalibrated_feature.shape.ndims,
                        uncalibrated_feature.shape))
            missing_input_value = missing_input_values[name]
            missing_output_value = missing_output_values[name]
            feature_regularizer_amounts = {
                regularizer_name: regularizer_amounts[regularizer_name][name]
                for regularizer_name in regularizer_amounts
            }

            # FutureWork: make the interpolation ops handle multi-dimension values,
            #   so this step is not needed.
            for dim_idx in range(feature_dim):
                per_dimension_feature_names += [name]
                split_name = name
                if feature_dim > 1:
                    split_name = '{}_dim_{}'.format(name, dim_idx)
                uncalibrated = uncalibrated_splits[dim_idx]
                if not num_keypoints[name]:
                    # No calibration for this feature: pass the raw value
                    # through. Missing-value handling requires calibration, so
                    # requesting it here is an error.
                    calibrated_splits += [uncalibrated]
                    if (missing_input_value is not None
                            or missing_output_value is not None):
                        raise ValueError(
                            'feature %s: cannot handle missing values if feature is not '
                            'calibrated, missing_input_value=%s, missing_output_value=%s'
                            %
                            (name, missing_input_value, missing_output_value))
                else:
                    calibrated, projection, reg = one_dimensional_calibration_layer(
                        uncalibrated,
                        num_keypoints[name],
                        signal_name=split_name,
                        keypoints_initializers=keypoints_initializers[name],
                        keypoints_initializer_fns=keypoints_initializer_fns[
                            name],
                        bound=bound[name],
                        monotonic=monotonic[name],
                        missing_input_value=missing_input_value,
                        missing_output_value=missing_output_value,
                        **feature_regularizer_amounts)
                    calibrated_splits += [calibrated]
                    if projection is not None:
                        projection_ops += [projection]
                    # Accumulate regularization losses across all calibrators.
                    total_regularization = tools.add_if_not_none(
                        total_regularization, reg)

        # Re-assemble all calibrated per-dimension signals into one
        # [batch_size, total_dims] tensor.
        all_calibrated = array_ops.stack(calibrated_splits,
                                         axis=1,
                                         name='stack_calibrated')
        return (all_calibrated, per_dimension_feature_names, projection_ops,
                total_regularization)
# Example #3
def load_keypoints_from_quantiles(feature_names,
                                  save_dir,
                                  num_keypoints,
                                  output_min=None,
                                  output_max=None,
                                  use_label_quantiles_for_outputs=False,
                                  reversed_dict=None,
                                  missing_input_values_dict=None,
                                  dtype=tf.float32):
    """Retrieves keypoints initialization values for selected features.

  It expects that the quantiles have already been calculated and saved in the
  save_dir by the save_quantiles_for_keypoints function. It will raise
  an I/O error if not.

  Args:
    feature_names: List of features names for which to get keypoints
      initialization values.
    save_dir: Directory where the quantiles have been saved to. Same value used
      when save_quantiles_for_keypoints was called.
    num_keypoints: Desired number of keypoints to use for calibration. This can
      either be a scalar to be used for all features, or a dict mapping feature
      name to num_keypoints. Fewer keypoints than requested can end up being
      used when for the given feature there are not enough different values. If
      num_keypoints for a feature is missing, None or 0, no initialization is
      generated.
    output_min: If not None, specifies the initial calibrated value associated
      with the first calibration keypoint. The keypoints outputs in between will
      be linearly interpolated.  It can be given as a scalar, in which case the
      value is used for all features, or a dict mapping feature name to
      output_min.
    output_max: Like output_min, but the calibrated value associated to the last
      keypoint. Scalar or dict.
    use_label_quantiles_for_outputs: Sets the keypoint outputs (calibrated
      values) to the label quantiles. If this parameter is true then output_min
      and output_max must both be None and the label quantiles must have been
      saved in the call to save_quantiles_for_keypoints that generated the
      quantile files (i.e. the input_fn parameter for the latter function must
      have returned a label). If this parameter is False, then neither
      output_min nor output_max may be None.
    reversed_dict: An optional dict. If reversed_dict[feature_name] is True,
      then the initial output keypoints will be in reversed order for that
      feature, i.e., input_min will be mapped to output_max or the last label
      quantile if use_label_quantiles_for_outputs is true, and input_max will be
      mapped to output_min or the first label quantile if
      use_label_quantiles_for_outputs is true. Reversing output keypoints is
      useful for decreasing monotonic calibrators.
    missing_input_values_dict: An optional dict. If provided, it should include
      all features passed via feature_names. If the value of
      missing_input_values[feature_name] is not None, it is excluded from the
      input keypoint values.
    dtype: Type to be used for calibration.

  Returns:
    Dict of feature name to pair of constant tensors that can be used to
    initialize calibrators keypoints inputs and outputs.

  Raises:
    tf.errors.NotFoundError: if quantiles file not found.
    ValueError: if an invalid combination of output_min, output_max and
      use_label_quantiles_for_outputs is given.
  """
    # Exactly one way of specifying outputs is allowed: either both
    # output_min/output_max, or label quantiles -- never both, never neither.
    if (output_min is None) != (output_max is None):
        raise ValueError(
            "Either both output_min and output_max should be given or neither."
        )

    output_labels_given = (output_min is not None)
    if (use_label_quantiles_for_outputs and output_labels_given):
        raise ValueError("If use_label_quantiles_for_outputs is true, then"
                         " output_min and output_max cannot be given.")
    if (not use_label_quantiles_for_outputs and not output_labels_given):
        raise ValueError(
            "Either use_label_quantiles_for_outputs should be true or "
            " output_min and output_max must be given.")

    subdir = os.path.join(save_dir, _QUANTILES_SUBDIRECTORY)
    # Bug fix: the third argument of cast_to_dict is the parameter name used
    # in error messages; previously num_keypoints (the value) was passed.
    num_keypoints = tools.cast_to_dict(num_keypoints, feature_names,
                                       "num_keypoints")
    if use_label_quantiles_for_outputs:
        label_quantiles = _load_quantiles(subdir, _LABEL_FEATURE_NAME)
    else:
        label_quantiles = None
        output_min = tools.cast_to_dict_of_tensor_scalars(
            output_min, feature_names, dtype, "output_min")
        output_max = tools.cast_to_dict_of_tensor_scalars(
            output_max, feature_names, dtype, "output_max")
    keypoints = {}
    for feature_name in feature_names:
        # Skip features with missing, None or 0 num_keypoints.
        if feature_name not in num_keypoints or not num_keypoints[feature_name]:
            continue
        all_quantiles = _load_quantiles(subdir, feature_name)
        # Drop the missing-value marker from the input keypoints, if given.
        if (missing_input_values_dict is not None
                and feature_name in missing_input_values_dict):
            exclude_val = missing_input_values_dict[feature_name]
            if exclude_val is not None:
                all_quantiles = [q for q in all_quantiles if q != exclude_val]
        quantiles = _resample_quantiles(all_quantiles,
                                        num_keypoints[feature_name])
        # Deduplicate: repeated quantiles would create degenerate keypoints.
        unique_quantiles = sorted(set(quantiles))
        input_keypoints = tf.constant(unique_quantiles,
                                      shape=[len(unique_quantiles)],
                                      dtype=dtype)
        if use_label_quantiles_for_outputs:
            output_keypoints = tf.constant(_resample_quantiles(
                label_quantiles, len(unique_quantiles)),
                                           shape=[len(unique_quantiles)],
                                           dtype=dtype)
        else:
            output_keypoints = tf.linspace(output_min[feature_name],
                                           output_max[feature_name],
                                           len(unique_quantiles))
        if reversed_dict is not None and reversed_dict[feature_name]:
            output_keypoints = tf.reverse(output_keypoints, axis=[0])
        keypoints[feature_name] = (input_keypoints, output_keypoints)
    return keypoints
# Example #4
def load_keypoints_from_quantiles(feature_names,
                                  save_dir,
                                  num_keypoints,
                                  output_min,
                                  output_max,
                                  dtype=dtypes.float32):
    """Retrieves keypoints initialization values for selected features.

  It expects that the quantiles have already been calculated and saved in the
  save_dir by the save_quantiles_for_keypoints function. It will raise
  an I/O error if not.

  Args:
    feature_names: List of features names for which to get keypoints
      initialization values.
    save_dir: Directory where the quantiles have been saved to. Same value used
      when save_quantiles_for_keypoints was called.
    num_keypoints: Desired number of keypoints to use for calibration. This
      can either be a scalar to be used for all features, or a dict mapping
      feature name to num_keypoints. Fewer keypoints than requested can end
      up being used when for the given feature there are not enough different
      values. If num_keypoints for a feature is missing, None or 0, no
      initialization is generated.
    output_min: Initial calibrated value associated with the first calibration
      keypoint. The keypoints outputs in between will be linearly interpolated.
      It can be given as a scalar, in which case value is used for all features,
      or a dict mapping feature name to output_min.
    output_max: Like output_min, but the calibrated value associated to the
      last keypoint. Scalar or dict.
    dtype: Type to be used for calibration.

  Returns:
    Dict of feature name to pair of constant tensors that can be used to
    initialize calibrators keypoints inputs and outputs.

  Raises:
    tf.errors.NotFoundError: if quantiles file not found.
  """
    subdir = os.path.join(save_dir, _QUANTILES_SUBDIRECTORY)
    # Bug fix: the third argument of cast_to_dict is the parameter name used
    # in error messages; previously num_keypoints (the value) was passed.
    num_keypoints = tools.cast_to_dict(num_keypoints, feature_names,
                                       "num_keypoints")
    output_min = tools.cast_to_dict_of_tensor_scalars(output_min,
                                                      feature_names, dtype,
                                                      "output_min")
    output_max = tools.cast_to_dict_of_tensor_scalars(output_max,
                                                      feature_names, dtype,
                                                      "output_max")

    keypoints = {}
    for feature_name in feature_names:
        # Skip features with missing, None or 0 num_keypoints.
        if feature_name not in num_keypoints or not num_keypoints[feature_name]:
            continue
        all_quantiles = _load_quantiles(subdir, feature_name)
        # Resample the saved quantiles down to the requested keypoint count.
        percentiles = np.linspace(0., 100., num_keypoints[feature_name])
        quantiles = np.percentile(all_quantiles,
                                  percentiles,
                                  interpolation="nearest")
        quantiles = sorted(set(quantiles))  # Remove repeated quantiles.
        # Inputs are the (unique) resampled quantiles; outputs are linearly
        # spaced between output_min and output_max.
        keypoints[feature_name] = (array_ops.constant(quantiles,
                                                      shape=[len(quantiles)],
                                                      dtype=dtype),
                                   math_ops.linspace(output_min[feature_name],
                                                     output_max[feature_name],
                                                     len(quantiles)))
    return keypoints