def testCastToDict(self):
  """Checks tools.cast_to_dict for scalar, full-dict and default-key inputs."""
  names = ['a', 'b', 'c']

  # A scalar input is broadcast to every feature name.
  broadcast = tools.cast_to_dict(1.0, names, 'blah')
  self.assertItemsEqual(broadcast.keys(), names)
  for name in names:
    self.assertEqual(broadcast[name], 1.0)

  # A fully-specified dict is passed through with its per-feature values.
  explicit = tools.cast_to_dict({'a': 1.0, 'b': 2.0, 'c': 3.0}, names, 'blah')
  self.assertItemsEqual(explicit.keys(), names)
  for name, expected in [('a', 1.0), ('b', 2.0), ('c', 3.0)]:
    self.assertEqual(explicit[name], expected)

  # A dict missing a feature (and without a default entry) raises ValueError.
  with self.assertRaisesRegexp(
      ValueError,
      'Dict given for blah does not contain definition for feature "c"'):
    tools.cast_to_dict({'a': 1.0, 'b': 2.0}, names, 'blah')

  # The DEFAULT_NAME entry fills in every feature not listed explicitly.
  defaulted = tools.cast_to_dict({'a': 1.0, tools.DEFAULT_NAME: 2.0}, names,
                                 'blah')
  self.assertItemsEqual(defaulted.keys(), names)
  for name, expected in [('a', 1.0), ('b', 2.0), ('c', 2.0)]:
    self.assertEqual(defaulted[name], expected)
def input_calibration_layer(columns_to_tensors,
                            num_keypoints,
                            feature_columns=None,
                            keypoints_initializers=None,
                            keypoints_initializer_fns=None,
                            bound=False,
                            monotonic=None,
                            missing_input_values=None,
                            missing_output_values=None,
                            dtype=dtypes.float32,
                            **regularizer_amounts):
  """Creates a calibration layer for the given input and feature_columns.

  Returns a tensor with the calibrated values of the given features, a list
  of the names of the features in the order they appear in the returned
  tensor, and a list of projection ops, that must be applied at each step
  (or every so many steps) to project the model to a feasible space: used
  for bounding the outputs or for imposing monotonicity -- the list will be
  empty if bound and monotonic are not set.

  Args:
    columns_to_tensors: A mapping from feature name to tensors. 'string' key
      means a base feature (not-transformed). If feature_columns is not set
      these are the features calibrated. Otherwise the transformed
      feature_columns are the ones calibrated.
    num_keypoints: Number of keypoints to use. Either a single int, or a dict
      mapping feature names to num_keypoints. If a value of the dict is 0 or
      None the correspondent feature won't be calibrated.
    feature_columns: Optional. If set to a set of FeatureColumns, these will
      be the features used and calibrated.
    keypoints_initializers: For evaluation or inference (or when resuming
      training from a checkpoint) the values will be loaded from disk, so they
      don't need to be given (leave it as None).
      Either a tuple of two tensors of shape [num_keypoints], or a dict
      mapping feature names to pair of tensors of shape
      [num_keypoints[feature_name]]. See load_keypoints_from_quantiles or
      uniform_keypoints_for_signal on how to generate these (module
      keypoints_initialization).
    keypoints_initializer_fns: Like keypoints_initializers but using lambda
      initializers. They should be compatible with tf.get_variable. If this is
      set, then keypoints_initializers must be None.
    bound: boolean whether output of calibration must be bound. Alternatively
      a dict mapping feature name to boundness.
    monotonic: whether calibration has to be kept monotonic: None or 0 means
      no monotonicity. Positive or negative values mean increasing or
      decreasing monotonicity respectively. Alternatively a dict mapping
      feature name to monotonic.
    missing_input_values: If set, and if the input has this value it is
      assumed to be missing and the output will either be calibrated to some
      value between `[calibration_output_min, calibration_output_max]` or set
      to a fixed value set by missing_output_value. Limitation: it only works
      for scalars. Either one value for all inputs, or a dict mapping feature
      name to missing_input_value for the respective feature.
    missing_output_values: Requires missing_input_value also to be set. If set
      it will convert missing input to this value. Either one value for all
      inputs, or a dict mapping feature name to missing_input_value for the
      respective feature.
    dtype: If any of the scalars are not given as tensors, they are converted
      to tensors with this dtype.
    **regularizer_amounts: Keyword args of regularization amounts passed to
      regularizers.calibrator_regularization(). Keyword names should be among
      supported regularizers.CALIBRATOR_REGULARIZERS and values should be
      either float or {feature_name: float}. If float, then same value is
      applied to all features.

  Returns:
    A tuple of:
    * calibrated tensor of shape [batch_size, sum(features dimensions)].
    * list of the feature names in the order they feature in the calibrated
      tensor. A name may appear more than once if the feature is
      multi-dimension (for instance a multi-dimension embedding)
    * list of projection ops, that must be applied at each step (or every so
      many steps) to project the model to a feasible space: used for bounding
      the outputs or for imposing monotonicity. Empty if none are requested.
    * None or tensor with regularization loss.

  Raises:
    ValueError: if dtypes are incompatible.
  """
  with ops.name_scope('input_calibration_layer'):
    feature_names = tools.get_sorted_feature_names(columns_to_tensors,
                                                   feature_columns)
    # Normalize every per-feature option to a {feature_name: value} dict so
    # the loop below can index uniformly, whether the caller passed a scalar
    # or an explicit dict.
    num_keypoints = tools.cast_to_dict(num_keypoints, feature_names,
                                       'num_keypoints')
    bound = tools.cast_to_dict(bound, feature_names, 'bound')
    monotonic = tools.cast_to_dict(monotonic, feature_names, 'monotonic')
    keypoints_initializers = tools.cast_to_dict(keypoints_initializers,
                                                feature_names,
                                                'keypoints_initializers')
    keypoints_initializer_fns = tools.cast_to_dict(
        keypoints_initializer_fns, feature_names, 'keypoints_initializer_fns')
    missing_input_values = tools.cast_to_dict(missing_input_values,
                                              feature_names,
                                              'missing_input_values')
    missing_output_values = tools.cast_to_dict(missing_output_values,
                                               feature_names,
                                               'missing_output_values')
    # Each regularizer amount may itself be scalar-or-dict; normalize each.
    regularizer_amounts = {
        regularizer_name: tools.cast_to_dict(
            regularizer_amounts[regularizer_name], feature_names,
            regularizer_name)
        for regularizer_name in regularizer_amounts
    }
    per_dimension_feature_names = []

    # Get uncalibrated tensors, either from columns_to_tensors, or using
    # feature_columns.
    if feature_columns is None:
      uncalibrated_features = [
          columns_to_tensors[name] for name in feature_names
      ]
    else:
      transformed_columns_to_tensors = columns_to_tensors.copy()
      dict_feature_columns = {
          f_col.name: f_col for f_col in feature_columns
      }
      uncalibrated_features = [
          tools.input_from_feature_column(transformed_columns_to_tensors,
                                          dict_feature_columns[name], dtype)
          for name in feature_names
      ]

    projection_ops = []
    calibrated_splits = []
    total_regularization = None
    for feature_idx in range(len(feature_names)):
      name = feature_names[feature_idx]
      uncalibrated_feature = uncalibrated_features[feature_idx]
      # Split multi-dimensional features into per-dimension 1-D signals so
      # each dimension gets its own calibrator.
      if uncalibrated_feature.shape.ndims == 1:
        feature_dim = 1
        uncalibrated_splits = [uncalibrated_feature]
      elif uncalibrated_feature.shape.ndims == 2:
        feature_dim = uncalibrated_feature.shape.dims[1].value
        uncalibrated_splits = array_ops.unstack(uncalibrated_feature, axis=1)
      else:
        raise ValueError(
            'feature {}: it has rank {}, but only ranks 1 or 2 are '
            'supported; feature shape={}'.format(
                name, uncalibrated_feature.shape.ndims,
                uncalibrated_feature.shape))
      missing_input_value = missing_input_values[name]
      missing_output_value = missing_output_values[name]
      # Regularization amounts applicable to this one feature.
      feature_regularizer_amounts = {
          regularizer_name: regularizer_amounts[regularizer_name][name]
          for regularizer_name in regularizer_amounts
      }

      # FutureWork: make the interpolation ops handle multi-dimension values,
      # so this step is not needed.
      for dim_idx in range(feature_dim):
        per_dimension_feature_names += [name]
        split_name = name
        if feature_dim > 1:
          split_name = '{}_dim_{}'.format(name, dim_idx)
        uncalibrated = uncalibrated_splits[dim_idx]
        if not num_keypoints[name]:
          # No calibration for this feature: pass the raw signal through.
          calibrated_splits += [uncalibrated]
          if (missing_input_value is not None or
              missing_output_value is not None):
            raise ValueError(
                'feature %s: cannot handle missing values if feature is not '
                'calibrated, missing_input_value=%s, missing_output_value=%s'
                % (name, missing_input_value, missing_output_value))
        else:
          calibrated, projection, reg = one_dimensional_calibration_layer(
              uncalibrated,
              num_keypoints[name],
              signal_name=split_name,
              keypoints_initializers=keypoints_initializers[name],
              keypoints_initializer_fns=keypoints_initializer_fns[name],
              bound=bound[name],
              monotonic=monotonic[name],
              missing_input_value=missing_input_value,
              missing_output_value=missing_output_value,
              **feature_regularizer_amounts)
          calibrated_splits += [calibrated]
          if projection is not None:
            projection_ops += [projection]
          total_regularization = tools.add_if_not_none(total_regularization,
                                                       reg)

    all_calibrated = array_ops.stack(
        calibrated_splits, axis=1, name='stack_calibrated')
    return (all_calibrated, per_dimension_feature_names, projection_ops,
            total_regularization)
def load_keypoints_from_quantiles(feature_names,
                                  save_dir,
                                  num_keypoints,
                                  output_min=None,
                                  output_max=None,
                                  use_label_quantiles_for_outputs=False,
                                  reversed_dict=None,
                                  missing_input_values_dict=None,
                                  dtype=tf.float32):
  """Retrieves keypoints initialization values for selected features.

  It expects that the quantiles have already been calculated and saved in the
  save_dir by the save_quantiles_for_keypoints function. It will raise an I/O
  error if not.

  Args:
    feature_names: List of features names for which to get keypoints
      initialization values.
    save_dir: Directory where the quantiles have been saved to. Same value
      used when save_quantiles_for_keypoints was called.
    num_keypoints: Desired number of keypoints to use for calibration. This
      can either be a scalar to be used for all features, or a dict mapping
      feature name to num_keypoints. Fewer keypoints than requested can end
      up being used when for the given feature there are not enough different
      values. If num_keypoints for a feature is missing, None or 0, no
      initialization is generated.
    output_min: If not None, specifies the initial calibrated value associated
      with the first calibration keypoint. The keypoints outputs in between
      will be linearly interpolated. It can be given as a scalar, in which
      case the value is used for all features, or a dict mapping feature name
      to output_min.
    output_max: Like output_min, but the calibrated value associated to the
      last keypoint. Scalar or dict.
    use_label_quantiles_for_outputs: Sets the keypoint outputs (calibrated
      values) to the label quantiles. If this parameter is true then
      output_min and output_max must both be None and the label quantiles
      must have been saved in the call to save_quantiles_for_keypoints that
      generated the quantile files (i.e. the input_fn parameter for the
      latter function must have returned a label). If this parameter is
      False, then neither output_min nor output_max may be None.
    reversed_dict: An optional dict. If reversed_dict[feature_name] is True,
      then the initial output keypoints will be in reversed order for that
      feature, i.e., input_min will be mapped to output_max or the last label
      quantile if use_label_quantiles_for_outputs is true, and input_max will
      be mapped to output_min or the first label quantile if
      use_label_quantiles_for_outputs is true. Reversing output keypoints is
      useful for decreasing monotonic calibrators.
    missing_input_values_dict: An optional dict. If provided, it should
      include all features passed via feature_names. If the value of
      missing_input_values[feature_name] is not None, it is excluded from the
      input keypoint values.
    dtype: Type to be used for calibration.

  Returns:
    Dict of feature name to pair of constant tensors that can be used to
    initialize calibrators keypoints inputs and outputs.

  Raises:
    tf.errors.NotFoundError: if quantiles file not found.
    ValueError: if an invalid combination of output_min, output_max and
      use_label_quantiles_for_outputs is given.
  """
  if (output_min is None) != (output_max is None):
    raise ValueError(
        "Either both output_min and output_max should be given or neither.")
  output_labels_given = (output_min is not None)
  if (use_label_quantiles_for_outputs and output_labels_given):
    raise ValueError("If use_label_quantiles_for_outputs is true, then"
                     " output_min and output_max cannot be given.")
  if (not use_label_quantiles_for_outputs and not output_labels_given):
    raise ValueError(
        "Either use_label_quantiles_for_outputs should be true or "
        " output_min and output_max must be given.")

  subdir = os.path.join(save_dir, _QUANTILES_SUBDIRECTORY)
  # Bug fix: the third argument is the name used in cast_to_dict's error
  # messages; it must be the string 'num_keypoints', not the value itself.
  num_keypoints = tools.cast_to_dict(num_keypoints, feature_names,
                                     'num_keypoints')
  if use_label_quantiles_for_outputs:
    label_quantiles = _load_quantiles(subdir, _LABEL_FEATURE_NAME)
  else:
    label_quantiles = None
    output_min = tools.cast_to_dict_of_tensor_scalars(output_min,
                                                      feature_names, dtype,
                                                      "output_min")
    output_max = tools.cast_to_dict_of_tensor_scalars(output_max,
                                                      feature_names, dtype,
                                                      "output_max")

  keypoints = {}
  for feature_name in feature_names:
    # Skip features with no requested keypoints (missing, None or 0).
    if feature_name not in num_keypoints or not num_keypoints[feature_name]:
      continue
    all_quantiles = _load_quantiles(subdir, feature_name)
    # Drop the missing-value sentinel from the candidate input keypoints.
    if (missing_input_values_dict is not None and
        feature_name in missing_input_values_dict):
      exclude_val = missing_input_values_dict[feature_name]
      if exclude_val is not None:
        all_quantiles = [q for q in all_quantiles if q != exclude_val]
    quantiles = _resample_quantiles(all_quantiles,
                                    num_keypoints[feature_name])
    # Repeated quantiles collapse, so fewer keypoints than requested may be
    # produced for low-cardinality features.
    unique_quantiles = sorted(set(quantiles))
    input_keypoints = tf.constant(
        unique_quantiles, shape=[len(unique_quantiles)], dtype=dtype)
    if use_label_quantiles_for_outputs:
      output_keypoints = tf.constant(
          _resample_quantiles(label_quantiles, len(unique_quantiles)),
          shape=[len(unique_quantiles)],
          dtype=dtype)
    else:
      output_keypoints = tf.linspace(output_min[feature_name],
                                     output_max[feature_name],
                                     len(unique_quantiles))
    if reversed_dict is not None and reversed_dict[feature_name]:
      output_keypoints = tf.reverse(output_keypoints, axis=[0])
    keypoints[feature_name] = (input_keypoints, output_keypoints)
  return keypoints
def load_keypoints_from_quantiles(feature_names,
                                  save_dir,
                                  num_keypoints,
                                  output_min,
                                  output_max,
                                  dtype=dtypes.float32):
  """Retrieves keypoints initialization values for selected features.

  It expects that the quantiles have already been calculated and saved in the
  save_dir by the save_quantiles_for_keypoints function. It will raise an I/O
  error if not.

  Args:
    feature_names: List of features names for which to get keypoints
      initialization values.
    save_dir: Directory where the quantiles have been saved to. Same value
      used when save_quantiles_for_keypoints was called.
    num_keypoints: Desired number of keypoints to use for calibration. This
      can either be a scalar to be used for all features, or a dict mapping
      feature name to num_keypoints. Fewer keypoints than requested can end
      up being used when for the given feature there are not enough different
      values. If num_keypoints for a feature is missing, None or 0, no
      initialization is generated.
    output_min: Initial calibrated value associated with the first calibration
      keypoint. The keypoints outputs in between will be linearly
      interpolated. It can be given as a scalar, in which case value is used
      for all features, or a dict mapping feature name to output_min.
    output_max: Like output_min, but the calibrated value associated to the
      last keypoint. Scalar or dict.
    dtype: Type to be used for calibration.

  Returns:
    Dict of feature name to pair of constant tensors that can be used to
    initialize calibrators keypoints inputs and outputs.

  Raises:
    tf.errors.NotFoundError: if quantiles file not found.
  """
  subdir = os.path.join(save_dir, _QUANTILES_SUBDIRECTORY)
  # Bug fix: the third argument is the name used in cast_to_dict's error
  # messages; it must be the string 'num_keypoints', not the value itself.
  num_keypoints = tools.cast_to_dict(num_keypoints, feature_names,
                                     'num_keypoints')
  output_min = tools.cast_to_dict_of_tensor_scalars(output_min, feature_names,
                                                    dtype, "output_min")
  output_max = tools.cast_to_dict_of_tensor_scalars(output_max, feature_names,
                                                    dtype, "output_max")

  keypoints = {}
  for feature_name in feature_names:
    # Skip features with no requested keypoints (missing, None or 0).
    if feature_name not in num_keypoints or not num_keypoints[feature_name]:
      continue
    all_quantiles = _load_quantiles(subdir, feature_name)
    # Resample the saved quantiles down to the requested keypoint count,
    # snapping to actually-observed values ("nearest" interpolation).
    percentiles = np.linspace(0., 100., num_keypoints[feature_name])
    quantiles = np.percentile(
        all_quantiles, percentiles, interpolation="nearest")
    quantiles = sorted(set(quantiles))  # Remove repeated quantiles.
    keypoints[feature_name] = (
        array_ops.constant(quantiles, shape=[len(quantiles)], dtype=dtype),
        math_ops.linspace(output_min[feature_name], output_max[feature_name],
                          len(quantiles)))
  return keypoints