    def testElementwiseQuantileBucketsWithWeights(self, input_dtype):
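        """Weighted quantile boundaries computed per element (reduce_instance_dims=False)."""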
        def analyzer_fn(inputs):
            return {
                'q_b':
                tft.quantiles(tf.cast(inputs['x'], input_dtype),
                              num_buckets=3,
                              epsilon=0.00001,
                              weights=inputs['weights'],
                              reduce_instance_dims=False)
            }

        input_data = [{
            'x': [[x, 2 * x], [2 * x, x]],
            'weights': [x / 100.]
        } for x in range(1, 3000)]
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.FixedLenFeature(
                [2, 2], tft_unit.canonical_numeric_dtype(input_dtype)),
            'weights':
            tf.io.FixedLenFeature([1], tf.float32)
        })
        # Expect 2 boundaries per element, dividing each element's values into 3 buckets.
        expected_outputs = {
            'q_b':
            np.array(
                [[[1732, 2449], [3464, 4898]], [[3464, 4898], [1732, 2449]]],
                np.float32)
        }
        self.assertAnalyzerOutputs(input_data,
                                   input_metadata,
                                   analyzer_fn,
                                   expected_outputs,
                                   desired_batch_size=1000)
  def _assert_quantile_boundaries(self,
                                  test_inputs,
                                  expected_boundaries,
                                  input_dtype,
                                  num_buckets=None,
                                  num_expected_buckets=None):
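    """Asserts that tft.quantiles yields expected_boundaries for test_inputs."""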

    if not num_buckets:
      num_buckets = len(expected_boundaries) + 1
    if not num_expected_buckets:
      num_expected_buckets = num_buckets

    def analyzer_fn(inputs):
      x = tf.cast(inputs['x'], input_dtype)
      return {'q_b': tft.quantiles(x, num_buckets, epsilon=0.0001)}

    input_data = [{'x': [x]} for x in test_inputs]

    input_metadata = tft_unit.metadata_from_feature_spec({
        'x':
            tf.io.FixedLenFeature([1],
                                  tft_unit.canonical_numeric_dtype(input_dtype))
    })

    expected_data = {'q_b': expected_boundaries}

    self.assertAnalyzerOutputs(
        input_data,
        input_metadata,
        analyzer_fn,
        expected_data,
        desired_batch_size=1000)
    def testGaussianize(self, input_data, output_data, elementwise):
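        """Checks scale_to_gaussian on dense inputs against precomputed output_data."""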
        def preprocessing_fn(inputs):
            x = inputs['x']
            x_cast = tf.cast(x, tf.as_dtype(input_data.dtype))
            x_gaussianized = tft.scale_to_gaussian(x_cast,
                                                   elementwise=elementwise)
            self.assertEqual(x_gaussianized.dtype,
                             tf.as_dtype(output_data.dtype))
            return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}

        input_data_dicts = [{'x': x} for x in input_data]
        expected_data_dicts = [{
            'x_gaussianized': x_gaussianized
        } for x_gaussianized in output_data]
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.FixedLenFeature(
                input_data.shape[1:],
                tft_unit.canonical_numeric_dtype(tf.as_dtype(
                    input_data.dtype))),
        })
        expected_metadata = tft_unit.metadata_from_feature_spec({
            'x_gaussianized':
            tf.io.FixedLenFeature(output_data.shape[1:], tf.float32),
        })
        self.assertAnalyzeAndTransformResults(input_data_dicts,
                                              input_metadata,
                                              preprocessing_fn,
                                              expected_data_dicts,
                                              expected_metadata,
                                              desired_batch_size=20,
                                              beam_pipeline=beam.Pipeline())
  def testQuantileBuckets(self, input_dtype):
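    """Computes quantile boundaries for a single dense feature with 3 buckets."""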

    def analyzer_fn(inputs):
      return {
          'q_b':
              tft.quantiles(
                  tf.cast(inputs['x'], input_dtype),
                  num_buckets=3,
                  epsilon=0.00001)
      }

    # NOTE: We force 3 batches: the data has 2999 elements and we request a
    # batch size of 1000.
    input_data = [{'x': [x]} for x in range(1, 3000)]
    input_metadata = tft_unit.metadata_from_feature_spec({
        'x':
            tf.io.FixedLenFeature([1],
                                  tft_unit.canonical_numeric_dtype(input_dtype))
    })
    # The expected data has 2 boundaries that divide the data into 3 buckets.
    expected_outputs = {'q_b': np.array([[1000, 2000]], np.float32)}
    self.assertAnalyzerOutputs(
        input_data,
        input_metadata,
        analyzer_fn,
        expected_outputs,
        desired_batch_size=1000)
    def testGaussianizeSparse(self, input_dtype, elementwise):
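        """Checks scale_to_gaussian on a SparseFeature, densified with NaN padding."""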
        def preprocessing_fn(inputs):
            x_gaussianized = tf.sparse.to_dense(tft.scale_to_gaussian(
                tf.cast(inputs['x'], input_dtype), elementwise=elementwise),
                                                default_value=np.nan)
            x_gaussianized.set_shape([None, 4])
            self.assertEqual(x_gaussianized.dtype,
                             impl_test._mean_output_dtype(input_dtype))
            return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}

        input_data_values = [
            516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16,
            508, 669, 617, 502, 532, 517, 479
        ]
        input_data = []
        for idx, v in enumerate(input_data_values):
            input_data.append({
                'idx': [0, 1],
                'val': [v] + [-input_data_values[-1 - idx]]
            })
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.SparseFeature('idx', 'val',
                                tft_unit.canonical_numeric_dtype(input_dtype),
                                4)
        })
        if elementwise:
            expected_data_values = [
                -0.09304726, -2.24682532, 1.56900163, -0.78244931, 0.48285998,
                0.47461339, -1.50929952, -0.39008015, 0.41659823, -0.81174337,
                0.54027596, 2.11624695, -1.72816411, -0.16046759, 1.13320023,
                0.74814557, -0.21014091, 0.04373742, -0.08454805, -0.39008015
            ]
        else:
            expected_data_values = [
                0.91555131, -1.54543642, 1.30767697, 0.73634456, 1.03620536,
                1.03443104, 0.26969729, 0.84990131, 1.02201077, 0.72569862,
                1.04862563, 1.49752966, -0.02838919, 0.90135672, 1.18702292,
                1.09475806, 0.89071077, 0.9439405, 0.91732564, 0.84990131
            ]
        expected_data = []
        for idx, v in enumerate(expected_data_values):
            expected_data.append({
                'x_gaussianized': ([v] + [-expected_data_values[-1 - idx]] +
                                   [float('nan'), float('nan')])
            })

        expected_metadata = tft_unit.metadata_from_feature_spec(
            {'x_gaussianized': tf.io.FixedLenFeature([4], tf.float32)})
        self.assertAnalyzeAndTransformResults(input_data,
                                              input_metadata,
                                              preprocessing_fn,
                                              expected_data,
                                              expected_metadata,
                                              desired_batch_size=20,
                                              beam_pipeline=beam.Pipeline())
    def testGaussianizeRagged(self, input_dtype):
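        """Checks scale_to_gaussian on a RaggedFeature (requires TF 2.x)."""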
        tft_unit.skip_if_not_tf2('RaggedFeature is not available in TF 1.x.')

        def preprocessing_fn(inputs):
            x_gaussianized = tft.scale_to_gaussian(
                tf.cast(inputs['x'], input_dtype))
            self.assertEqual(x_gaussianized.dtype,
                             impl_test._mean_output_dtype(input_dtype))
            return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}

        input_data_values = [
            516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16,
            508, 669, 617, 502, 532, 517, 479
        ]
        input_data = []
        for idx, v in enumerate(input_data_values):
            input_data.append({
                'val': [v, -input_data_values[-1 - idx]],
                'row_lengths_1': [2, 1, 0],
                'row_lengths_2': [1, 0, 1],
            })
        input_metadata = tft.DatasetMetadata.from_feature_spec({
            'x':
            tf.io.RaggedFeature(
                tft_unit.canonical_numeric_dtype(input_dtype),
                value_key='val',
                partitions=[
                    tf.io.RaggedFeature.RowLengths('row_lengths_1'),  # pytype: disable=attribute-error
                    tf.io.RaggedFeature.RowLengths('row_lengths_2')  # pytype: disable=attribute-error
                ]),
        })
        expected_data_values = [
            0.91555131, -1.54543642, 1.30767697, 0.73634456, 1.03620536,
            1.03443104, 0.26969729, 0.84990131, 1.02201077, 0.72569862,
            1.04862563, 1.49752966, -0.02838919, 0.90135672, 1.18702292,
            1.09475806, 0.89071077, 0.9439405, 0.91732564, 0.84990131
        ]
        expected_data = []
        for idx, v in enumerate(expected_data_values):
            expected_data.append({
                'x_gaussianized$ragged_values':
                ([v, -expected_data_values[-1 - idx]]),
                'x_gaussianized$row_lengths_1': [2, 1, 0],
                'x_gaussianized$row_lengths_2': [1, 0, 1]
            })

        self.assertAnalyzeAndTransformResults(
            input_data,
            input_metadata,
            preprocessing_fn,
            expected_data,
            desired_batch_size=20,
            # Runs the test deterministically on the whole batch.
            beam_pipeline=beam.Pipeline())
    def _assert_quantile_boundaries(self,
                                    test_inputs,
                                    expected_boundaries,
                                    input_dtype,
                                    num_buckets=None,
                                    num_expected_buckets=None,
                                    always_return_num_quantiles=True):
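        """Asserts quantile boundaries via a full analyze-and-transform pass."""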

        if not num_buckets:
            num_buckets = len(expected_boundaries) + 1
        if not num_expected_buckets:
            num_expected_buckets = num_buckets

        def preprocessing_fn(inputs):
            x = tf.cast(inputs['x'], input_dtype)
            quantiles = tft.quantiles(
                x,
                num_buckets,
                epsilon=0.0001,
                always_return_num_quantiles=always_return_num_quantiles)
            quantiles.set_shape([1, num_expected_buckets - 1])
            return {'q_b': quantiles}

        input_data = [{'x': [x]} for x in test_inputs]

        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.FixedLenFeature(
                [1], tft_unit.canonical_numeric_dtype(input_dtype))
        })

        # The quantiles output has a leading dimension of 1, so the transform
        # emits one row of boundaries per batch.
        batch_size = 1000
        expected_data = []
        num_batches = int(math.ceil(len(test_inputs) / float(batch_size)))

        for _ in range(num_batches):
            expected_data += [{'q_b': expected_boundaries}]

        expected_metadata = None

        self.assertAnalyzeAndTransformResults(
            input_data,
            input_metadata,
            preprocessing_fn,
            expected_data,
            expected_metadata,
            desired_batch_size=batch_size,
            # TODO(b/110855155): Remove this explicit use of DirectRunner.
            beam_pipeline=beam.Pipeline())
    def testQuantileBucketsWithWeights(self, input_dtype, with_nans):
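        """Weighted quantile boundaries, optionally with NaN values and weights mixed in."""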
        def analyzer_fn(inputs):
            return {
                'q_b':
                tft.quantiles(tf.cast(inputs['x'], input_dtype),
                              num_buckets=3,
                              epsilon=0.00001,
                              weights=inputs['weights'])
            }

        input_data = [{
            'x': [x],
            'weights': [x / 100.]
        } for x in range(1, 3000)]
        if with_nans:
            input_data += [{
                'x': [np.nan],
                'weights': [100000]
            }, {
                'x': [100000],
                'weights': [np.nan]
            }]
        input_metadata = tft.DatasetMetadata.from_feature_spec({
            'x':
            tf.io.FixedLenFeature(
                [1], tft_unit.canonical_numeric_dtype(input_dtype)),
            'weights':
            tf.io.FixedLenFeature([1], tf.float32)
        })
        # The expected data has 2 boundaries that divide the data into 3 buckets.
        expected_outputs = {'q_b': np.array([[1732, 2449]], np.float32)}
        self.assertAnalyzerOutputs(input_data,
                                   input_metadata,
                                   analyzer_fn,
                                   expected_outputs,
                                   desired_batch_size=1000)
    def testBucketizationElementwise(self, test_inputs, expected_boundaries,
                                     do_shuffle, epsilon, should_apply,
                                     is_manual_boundaries, input_dtype):
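        """Elementwise bucketization via quantiles + apply_buckets or tft.bucketize."""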
        test_inputs = list(test_inputs)

        # Shuffle the input to add randomness to input generated with
        # simple range().
        if do_shuffle:
            random.shuffle(test_inputs)

        def preprocessing_fn(inputs):
            x = tf.cast(inputs['x'], input_dtype)

            num_buckets = len(expected_boundaries) + 1
            if should_apply:
                if is_manual_boundaries:
                    bucket_boundaries = [
                        expected_boundaries,
                        [2 * b for b in expected_boundaries]
                    ]
                else:
                    bucket_boundaries = tft.quantiles(
                        x, num_buckets, epsilon, reduce_instance_dims=False)
                    bucket_boundaries = tf.unstack(bucket_boundaries, axis=0)

                result = []
                for i, boundaries in enumerate(bucket_boundaries):
                    boundaries = tf.cast(boundaries, tf.float32)
                    result.append(
                        tft.apply_buckets(x[:, i],
                                          tf.expand_dims(boundaries, axis=0)))
                result = tf.stack(result, axis=1)

            else:
                result = tft.bucketize(x,
                                       num_buckets=num_buckets,
                                       epsilon=epsilon,
                                       elementwise=True)
            return {'q_b': result}

        input_data = [{'x': [x, 2 * x]} for x in test_inputs]

        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.FixedLenFeature(
                [2], tft_unit.canonical_numeric_dtype(input_dtype))
        })

        # Sort the input by value; the index is used to build expected_data.
        sorted_list = sorted(enumerate(test_inputs), key=lambda p: p[1])

        # Expected data has the same size as input, one bucket per input value.
        expected_data = [[None, None]] * len(test_inputs)
        bucket = 0

        for (index, x) in sorted_list:
            # Increment the bucket number when crossing the boundary
            if (bucket < len(expected_boundaries)
                    and x >= expected_boundaries[bucket]):
                bucket += 1
            expected_data[index] = {'q_b': [bucket, bucket]}

        expected_metadata = tft_unit.metadata_from_feature_spec(
            {
                'q_b': tf.io.FixedLenFeature([2], tf.int64),
            }, None)

        @contextlib.contextmanager
        def no_assert():
            yield None

        assertion = no_assert()
        if input_dtype == tf.float16:
            assertion = self.assertRaisesRegex(
                TypeError,
                '.*DataType float16 not in list of allowed values.*')

        with assertion:
            self.assertAnalyzeAndTransformResults(
                input_data,
                input_metadata,
                preprocessing_fn,
                expected_data,
                expected_metadata,
                desired_batch_size=1000,
                # TODO(b/110855155): Remove this explicit use of DirectRunner.
                beam_pipeline=beam.Pipeline())
  def testBucketization(self, test_inputs, expected_boundaries, do_shuffle,
                        epsilon, should_apply, is_manual_boundaries,
                        input_dtype):
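    """Scalar-feature bucketization via quantiles + apply_buckets or tft.bucketize."""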
    test_inputs = list(test_inputs)

    # Shuffle the input to add randomness to input generated with
    # simple range().
    if do_shuffle:
      random.shuffle(test_inputs)

    def preprocessing_fn(inputs):
      x = tf.cast(inputs['x'], input_dtype)
      num_buckets = len(expected_boundaries) + 1
      if should_apply:
        if is_manual_boundaries:
          bucket_boundaries = [expected_boundaries]
        else:
          bucket_boundaries = tft.quantiles(inputs['x'], num_buckets, epsilon)
        result = tft.apply_buckets(x, bucket_boundaries)
      else:
        result = tft.bucketize(x, num_buckets=num_buckets, epsilon=epsilon)
      return {'q_b': result}

    input_data = [{'x': [x]} for x in test_inputs]

    input_metadata = tft_unit.metadata_from_feature_spec({
        'x':
            tf.io.FixedLenFeature([1],
                                  tft_unit.canonical_numeric_dtype(input_dtype))
    })

    # Sort the input by value; the index is used to build expected_data.
    indexed_input = enumerate(test_inputs)

    sorted_list = sorted(indexed_input, key=lambda p: p[1])

    # Expected data has the same size as input, one bucket per input value.
    expected_data = [None] * len(test_inputs)
    bucket = 0
    for (index, x) in sorted_list:
      # Increment the bucket number when crossing the boundary
      if (bucket < len(expected_boundaries) and
          x >= expected_boundaries[bucket]):
        bucket += 1
      expected_data[index] = {'q_b': [bucket]}

    expected_metadata = tft_unit.metadata_from_feature_spec(
        {
            'q_b': tf.io.FixedLenFeature([1], tf.int64),
        }, {
            'q_b':
                schema_pb2.IntDomain(
                    min=0, max=len(expected_boundaries), is_categorical=True),
        })

    @contextlib.contextmanager
    def no_assert():
      yield None

    assertion = no_assert()
    if input_dtype == tf.float16:
      assertion = self.assertRaisesRegex(
          TypeError, '.*DataType float16 not in list of allowed values.*')

    with assertion:
      self.assertAnalyzeAndTransformResults(
          input_data,
          input_metadata,
          preprocessing_fn,
          expected_data,
          expected_metadata,
          desired_batch_size=1000)
    def testTukeyHHAnalyzersWithSparseInputs(self,
                                             input_dtype,
                                             output_dtypes,
                                             elementwise=True):
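        """Checks Tukey location/scale/h-params analyzers on a 2-D SparseFeature."""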
        def analyzer_fn(inputs):
            a = tf.cast(inputs['a'], input_dtype)

            def assert_and_cast_dtype(tensor, out_dtype):
                self.assertEqual(tensor.dtype, out_dtype)
                return tf.cast(tensor,
                               tft_unit.canonical_numeric_dtype(out_dtype))

            return {
                'tukey_location':
                assert_and_cast_dtype(
                    tft.tukey_location(a,
                                       reduce_instance_dims=not elementwise),
                    output_dtypes['tukey_location']),
                'tukey_scale':
                assert_and_cast_dtype(
                    tft.tukey_scale(a, reduce_instance_dims=not elementwise),
                    output_dtypes['tukey_scale']),
                'tukey_hl':
                assert_and_cast_dtype(
                    tft.tukey_h_params(
                        a, reduce_instance_dims=not elementwise)[0],
                    output_dtypes['tukey_hl']),
                'tukey_hr':
                assert_and_cast_dtype(
                    tft.tukey_h_params(
                        a, reduce_instance_dims=not elementwise)[1],
                    output_dtypes['tukey_hr']),
            }

        input_data_values = [
            516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16,
            508, 669, 617, 502, 532, 517, 479
        ]
        input_data = []
        for idx, v in enumerate(input_data_values):
            input_data.append({
                'idx0': [0, 0],
                'idx1': [0, 1],
                'val': [v, -input_data_values[-1 - idx]]
            })
        input_metadata = tft_unit.metadata_from_feature_spec({
            'a':
            tf.io.SparseFeature(['idx0', 'idx1'], 'val',
                                tft_unit.canonical_numeric_dtype(input_dtype),
                                (2, 2))
        })

        expected_outputs = {
            'tukey_location':
            np.array(
                [[526.89355, -526.89355], [0., 0.]] if elementwise else 0.0,
                tft_unit.canonical_numeric_dtype(
                    output_dtypes['tukey_location']).as_numpy_dtype),
            'tukey_scale':
            np.array([[116.73997, 116.73997], [1., 1.]]
                     if elementwise else 572.2776,
                     tft_unit.canonical_numeric_dtype(
                         output_dtypes['tukey_scale']).as_numpy_dtype),
            'tukey_hl':
            np.array(
                [[0.6629082, 0.11148566], [0., 0.]] if elementwise else 0.0,
                tft_unit.canonical_numeric_dtype(
                    output_dtypes['tukey_hl']).as_numpy_dtype),
            'tukey_hr':
            np.array(
                [[0.11148566, 0.6629082], [0., 0.]] if elementwise else 0.0,
                tft_unit.canonical_numeric_dtype(
                    output_dtypes['tukey_hr']).as_numpy_dtype),
        }

        self.assertAnalyzerOutputs(
            input_data,
            input_metadata,
            analyzer_fn,
            expected_outputs,
            desired_batch_size=20,
            # Runs the test deterministically on the whole batch.
            beam_pipeline=beam.Pipeline())
    def testTukeyHHAnalyzersWithRaggedInputs(self, input_dtype):
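        """Checks Tukey location/scale/h-params analyzers on a RaggedFeature (TF 2.x only)."""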
        tft_unit.skip_if_not_tf2('RaggedFeature is not available in TF 1.x.')

        output_dtype = impl_test._mean_output_dtype(input_dtype)
        canonical_output_dtype = tft_unit.canonical_numeric_dtype(output_dtype)

        def analyzer_fn(inputs):
            a = tf.cast(inputs['a'], input_dtype)

            def assert_and_cast_dtype(tensor):
                self.assertEqual(tensor.dtype, output_dtype)
                return tf.cast(tensor, canonical_output_dtype)

            return {
                'tukey_location': assert_and_cast_dtype(tft.tukey_location(a)),
                'tukey_scale': assert_and_cast_dtype(tft.tukey_scale(a)),
                'tukey_hl': assert_and_cast_dtype(tft.tukey_h_params(a)[0]),
                'tukey_hr': assert_and_cast_dtype(tft.tukey_h_params(a)[1]),
            }

        input_data_values = [
            516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16,
            508, 669, 617, 502, 532, 517, 479
        ]
        input_data = []
        for idx, v in enumerate(input_data_values):
            input_data.append({
                'val': [v, -input_data_values[-1 - idx]],
                'row_lengths_1': [2, 0, 1],
                'row_lengths_2': [0, 1, 1]
            })
        input_metadata = tft.DatasetMetadata.from_feature_spec({
            'a':
            tf.io.RaggedFeature(
                tft_unit.canonical_numeric_dtype(input_dtype),
                value_key='val',
                partitions=[
                    tf.io.RaggedFeature.RowLengths('row_lengths_1'),  # pytype: disable=attribute-error
                    tf.io.RaggedFeature.RowLengths('row_lengths_2')  # pytype: disable=attribute-error
                ]),
        })

        expected_outputs = {
            'tukey_location':
            np.array(0.0, canonical_output_dtype.as_numpy_dtype),
            'tukey_scale':
            np.array(572.2776, canonical_output_dtype.as_numpy_dtype),
            'tukey_hl':
            np.array(0.0, canonical_output_dtype.as_numpy_dtype),
            'tukey_hr':
            np.array(0.0, canonical_output_dtype.as_numpy_dtype),
        }

        self.assertAnalyzerOutputs(
            input_data,
            input_metadata,
            analyzer_fn,
            expected_outputs,
            desired_batch_size=20,
            # Runs the test deterministically on the whole batch.
            beam_pipeline=beam.Pipeline())