def _print_warnings(self, arg_values, result_value):
    """Prints diagnostics when the result holds far too many elements.

    Nothing is printed unless `result_value` is a tf.Tensor or a
    tf.SparseTensor whose element count (for a SparseTensor, the element count
    of its `values`) exceeds 10 * limits.MAX_TENSOR_ELEMENTS.

    Args:
      arg_values: Iterable of argument wrappers. Each one must expose `value`,
        `shape`, `is_primitive`, `num_elements()` and
        `reconstruct_expression()`.
      result_value: The result object whose size is checked.
    """
    # Pick the tensor whose elements are counted; other result types are
    # never reported.
    if isinstance(result_value, tf.Tensor):
        counted = result_value
    elif isinstance(result_value, tf.SparseTensor):
        counted = result_value.values
    else:
        return

    result_size = tf_coder_utils.num_tensor_elements(counted)
    if result_size <= 10 * limits.MAX_TENSOR_ELEMENTS:
        return

    print(
        'Warning: {} produced much-too-large tensor of shape {} and {} '
        'elements.'.format(self.name, result_value.shape.as_list(),
                           result_size))

    for index, argument in enumerate(arg_values):
        raw = argument.value
        if isinstance(raw, tf.Tensor):
            arg_size = argument.num_elements()
            print(' argument {} has shape {} and {} elements'.format(
                index, argument.shape, arg_size))
            # Only small tensors are printed in full.
            if arg_size <= 20:
                print(' argument {} is: {}'.format(index, raw))
        elif argument.is_primitive:
            print(' argument {} is: {}'.format(index, raw))
        else:
            print(' argument {} has type {}'.format(index, type(raw)))
        # The reconstruction is reported for every argument, whatever its type.
        print(' argument {} has reconstruction: {}'.format(
            index, argument.reconstruct_expression()))
def featurize_input_and_output(
        input_value: value_module.Value,
        output_value: value_module.Value) -> Dict[Text, List[float]]:
    """Computes features relating one input Value to one output Value.

    Every key of the returned dict starts with 'io_':

    * 'io_comparisons', list[int]: Ints in the range [0, 2] comparing input
      against output by total size, number of elements, rank, and then by each
      pair of corresponding entries of the two fixed-length shapes.
    * 'io_counts', list[int]: The number of input elements found in the
      output, the number of output elements found in the input, and the number
      of distinct values present in both.
    * 'io_count_buckets', list[int]: Like 'io_counts' but bucketed.
    * 'io_fractions', list[float]: The first two counts divided by the number
      of input and output elements respectively, then the number of distinct
      shared values divided by the number of input elements and by the number
      of output elements.
    * 'io_booleans', list[int]: Ints in the range [0, 1]: same shape, same
      dtype, all input elements found in the output, all output elements found
      in the input; then one flag per input shape entry (nonzero and present
      in the output shape) and one flag per output shape entry (nonzero and
      present in the input shape).

    Args:
      input_value: The input Value.
      output_value: The output Value.

    Returns:
      The feature dict described above.
    """
    in_tensor = _as_tensor_to_featurize(input_value)
    out_tensor = _as_tensor_to_featurize(output_value)

    in_elements = _elements_list(in_tensor)
    out_elements = _elements_list(out_tensor)
    in_count = len(in_elements)
    out_count = len(out_elements)
    in_shape = _fixed_length_shape(in_tensor)
    out_shape = _fixed_length_shape(out_tensor)

    # Which side is larger: total size, element count, rank, then per-dimension.
    comparisons = [
        _comparison(tf_coder_utils.num_tensor_elements(in_tensor),
                    tf_coder_utils.num_tensor_elements(out_tensor)),
        _comparison(in_count, out_count),
        _comparison(len(in_tensor.shape), len(out_tensor.shape)),
    ]
    for in_length, out_length in zip(in_shape, out_shape):
        comparisons.append(_comparison(in_length, out_length))

    # Count elements appearing in both the input and output.
    in_unique = set(in_elements)
    out_unique = set(out_elements)
    in_found_in_out = sum(1 for element in in_elements
                          if element in out_unique)
    out_found_in_in = sum(1 for element in out_elements
                          if element in in_unique)
    shared_unique = len(in_unique.intersection(out_unique))
    counts = [in_found_in_out, out_found_in_in, shared_unique]

    fractions = [
        _safe_divide(in_found_in_out, in_count),
        _safe_divide(out_found_in_in, out_count),
        _safe_divide(shared_unique, in_count),
        _safe_divide(shared_unique, out_count),
    ]

    booleans = [
        int(in_tensor.shape.as_list() == out_tensor.shape.as_list()),
        int(in_tensor.dtype == out_tensor.dtype),
        int(in_found_in_out == in_count),
        int(out_found_in_in == out_count),
    ]
    # A zero shape entry never counts as a match.
    for in_length in in_shape:
        booleans.append(int(in_length in out_shape) if in_length else 0)
    for out_length in out_shape:
        booleans.append(int(out_length in in_shape) if out_length else 0)

    return {
        'io_comparisons': comparisons,
        'io_counts': counts,
        'io_count_buckets': [_bucket(count, COUNT_BOUNDARIES)
                             for count in counts],
        'io_fractions': fractions,
        'io_booleans': booleans,
    }
def featurize_value(
        value: value_module.Value
) -> Dict[Text, Union[List[float], List[Text]]]:
    """Builds the feature dict describing a single Value.

    The feature dict will contain the following:

    * 'kind': One int: 0 for a primitive, 1 for a sequence, 2 for a Tensor,
      3 for a SparseTensor, 4 otherwise.
    * 'dtype': One int, the index of the value's DType among the supported
      dtypes (int dtypes, then float dtypes, then tf.bool).
    * 'rank': One int in the range [0, MAX_RANK], where MAX_RANK denotes any
      rank greater than or equal to MAX_RANK.
    * 'shape': A list of MAX_RANK ints denoting the shape, 0-padded.
    * 'shape_buckets': Like 'shape' but bucketed.
    * 'floats': [max, min, mean, mean magnitude] of the elements.
    * 'float_buckets': Like 'floats' but bucketed.
    * 'counts': [size, number of elements, positive, zeros, ones, negative,
      elements in [0, 1], unique elements].
    * 'count_buckets': Like 'counts' but bucketed.
    * 'fractions': Number of elements divided by size, followed by each count
      after the first two divided by the number of elements.
    * 'booleans': 0-1 ints: sorted, finite, all positive, all nonnegative,
      all negative, all zero-or-one, all in [0, 1], all unique.
    * 'value_string': The repr of the input Value.

    Args:
      value: The Value to featurize.

    Returns:
      The feature dict described above.

    Raises:
      ValueError: If the dtype of the featurized tensor is not supported.
    """

    def count_true(mask):
        """Returns how many entries of a boolean tensor are True, as an int."""
        return int(tf.reduce_sum(tf.cast(mask, tf.int32)))

    tensor = _as_tensor_to_featurize(value)

    if value.is_primitive:
        kind = 0
    elif value.is_sequence:
        kind = 1
    elif value.is_tensor:
        kind = 2
    elif value.is_sparse_tensor:
        kind = 3
    else:
        kind = 4

    supported_dtypes = (
        tf_coder_utils.INT_DTYPES + tf_coder_utils.FLOAT_DTYPES + (tf.bool,))
    dtype = tensor.dtype
    if dtype not in supported_dtypes:
        raise ValueError(
            'Cannot featurize unsupported dtype: {}'.format(dtype))

    shape = _fixed_length_shape(tensor)

    features = {}  # type: Dict[Text, Union[List[float], List[Text]]]
    features['kind'] = [kind]
    features['dtype'] = [supported_dtypes.index(dtype)]
    features['rank'] = [min(len(tensor.shape), MAX_RANK)]
    features['shape'] = shape
    features['shape_buckets'] = [_bucket(length, COUNT_BOUNDARIES)
                                 for length in shape]

    # "Elements" are the provided values (without default elements in
    # SparseTensors). They are cast to float32 because, e.g., the mean of a
    # bool tensor can't be computed.
    elements = tf.cast(_elements(tensor), tf.float32)
    elements_list = _elements_list(tensor)

    floats = [
        tf_coder_utils.max_tensor_value(elements),
        tf_coder_utils.min_tensor_value(elements),
        float(tf.reduce_mean(elements)),
        float(tf.reduce_mean(tf.abs(elements))),
    ]
    features['floats'] = floats
    features['float_buckets'] = [_bucket(stat, FLOAT_BOUNDARIES)
                                 for stat in floats]

    # The total size of the tensor, meaning the product of dimension lengths.
    # This may be huge for SparseTensors.
    size = tf_coder_utils.num_tensor_elements(tensor)
    # For SparseTensors, num_elements != size.
    num_elements = len(elements_list)

    as_float = tf.cast(elements, tf.float32)
    positive = count_true(elements > 0)
    zeros = count_true(tf.abs(as_float) < EPSILON)
    ones = count_true(tf.abs(as_float - 1.0) < EPSILON)
    negative = count_true(elements < 0)
    probabilities = count_true(
        tf.logical_and(elements >= 0, elements <= 1))
    unique = len(set(elements_list))

    counts = [size, num_elements, positive, zeros, ones, negative,
              probabilities, unique]
    features['counts'] = counts
    features['count_buckets'] = [_bucket(count, COUNT_BOUNDARIES)
                                 for count in counts]

    # The fractions below rely on the first two counts being size and
    # num_elements.
    assert counts[0] == size and counts[1] == num_elements
    fractions = [_safe_divide(num_elements, size)]
    for count in counts[2:]:
        fractions.append(_safe_divide(count, num_elements))
    features['fractions'] = fractions

    # TODO(kshi): Consider computing the number of times the mode appears.
    # TODO(kshi): Consider computing, for each axis, whether that axis
    # represents a probability distribution, with all elements in [0, 1]
    # summing to 1.

    # A rank-0 tensor is treated as sorted without calling tf.sort.
    if len(elements.shape):
        is_sorted = int(
            tf.reduce_all(tf.equal(elements, tf.sort(elements))))
    else:
        is_sorted = 1

    features['booleans'] = [
        is_sorted,
        int(tf.reduce_all(tf.math.is_finite(elements))),
        int(positive == num_elements),
        int(positive + zeros == num_elements),
        int(negative == num_elements),
        int(zeros + ones == num_elements),
        int(probabilities == num_elements),
        int(unique == num_elements),
    ]
    features['value_string'] = [repr(value)]
    return features
def test_num_tensor_elements_using_content(self, content, expected_result):
    """Checks num_tensor_elements on a constant tensor built from `content`."""
    tensor = tf.constant(content)
    actual = tf_coder_utils.num_tensor_elements(tensor)
    self.assertEqual(actual, expected_result)
def test_num_tensor_elements_using_shape(self, shape, expected_result):
    """Checks num_tensor_elements on an all-ones tensor of the given `shape`."""
    tensor = tf.ones(shape)
    actual = tf_coder_utils.num_tensor_elements(tensor)
    self.assertEqual(actual, expected_result)
def num_elements(self):
    """Returns the number of elements in the wrapped value.

    For a sparse tensor, only the elements of its `values` are counted.
    """
    counted = self.value.values if self.is_sparse_tensor else self.value
    return tf_coder_utils.num_tensor_elements(counted)