def testGaussianize(self, input_data, output_data, elementwise):
        def preprocessing_fn(inputs):
            x = inputs['x']
            x_cast = tf.cast(x, tf.as_dtype(input_data.dtype))
            x_gaussianized = tft.scale_to_gaussian(x_cast,
                                                   elementwise=elementwise)
            self.assertEqual(x_gaussianized.dtype,
                             tf.as_dtype(output_data.dtype))
            return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}

        input_data_dicts = [{'x': x} for x in input_data]
        expected_data_dicts = [{
            'x_gaussianized': x_gaussianized
        } for x_gaussianized in output_data]
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.FixedLenFeature(
                input_data.shape[1:],
                tft_unit.canonical_numeric_dtype(tf.as_dtype(
                    input_data.dtype))),
        })
        expected_metadata = tft_unit.metadata_from_feature_spec({
            'x_gaussianized':
            tf.io.FixedLenFeature(output_data.shape[1:], tf.float32),
        })
        self.assertAnalyzeAndTransformResults(input_data_dicts,
                                              input_metadata,
                                              preprocessing_fn,
                                              expected_data_dicts,
                                              expected_metadata,
                                              desired_batch_size=20,
                                              beam_pipeline=beam.Pipeline())
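testGaussianize takes input_data, output_data, and elementwise as parameters, so it is a parameterized test whose decorator was dropped in extraction. A minimal sketch of how such a case might be registered, assuming absl's parameterized decorator and tft_unit.TransformTestCase as the base class (the case values below are placeholders, not the original fixtures):

from absl.testing import parameterized
import numpy as np

# Hypothetical wiring; the original decorator and its case values were not
# preserved in this extract.
class GaussianizeTest(tft_unit.TransformTestCase):

    @parameterized.named_parameters(
        dict(testcase_name='float32_global',
             input_data=np.array([[1.0], [2.0]], np.float32),    # placeholder
             output_data=np.array([[-1.0], [1.0]], np.float32),  # placeholder
             elementwise=False),
    )
    def testGaussianize(self, input_data, output_data, elementwise):
        ...  # body as above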
Example No. 2
    def testBasicType(self):
        config = {
            'timesteps': 3,
            'time_features': [],
            'features': ['a'],
            'enable_timestamp_features': False
        }

        input_data = [{'a': [1000.0, 2000.0, 3000.0]}]
        input_metadata = tft_unit.metadata_from_feature_spec(
            {'a': tf.io.VarLenFeature(tf.float32)})

        output = [[1000], [2000], [3000]]

        output = stats.zscore(output)

        expected_data = [{'Float32': output, 'LABEL': output}]

        expected_metadata = tft_unit.metadata_from_feature_spec({
            'Float32':
            tf.io.FixedLenFeature([config['timesteps'], 1], tf.float32),
            'LABEL':
            tf.io.FixedLenFeature([config['timesteps'], 1], tf.float32)
        })

        preprocessing_fn = functools.partial(
            encoder_decoder_preprocessing.preprocessing_fn,
            custom_config=config)

        self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                              preprocessing_fn, expected_data,
                                              expected_metadata)
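The expected output above is just scipy's population z-score over the timestep axis; a quick standalone check:

import numpy as np
from scipy import stats

# stats.zscore defaults to axis=0 and ddof=0, which is what the test relies on.
out = stats.zscore([[1000.0], [2000.0], [3000.0]])
print(out)  # approx [[-1.2247], [0.], [1.2247]]
np.testing.assert_allclose(out.mean(), 0.0, atol=1e-12)
np.testing.assert_allclose(out.std(), 1.0)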
Example No. 3
    def testWithTimeStamps(self):

        config = {
            'timesteps': 2,
            'time_features': ['MINUTE', 'MONTH', 'HOUR', 'DAY', 'YEAR'],
            'features': ['float32', 'foo_TIMESTAMP'],
            'enable_timestamp_features': True
        }

        # The values need to be different enough that the z-score doesn't produce NaNs.
        timestamp_1 = int(datetime(2000, 1, 1, 0, 0, 0).timestamp())
        timestamp_2 = int(datetime(2001, 6, 15, 12, 30, 30).timestamp())

        input_data = [{
            'float32': [1000.0, 2000.0],
            'foo_TIMESTAMP': [timestamp_1 * 1000, timestamp_2 * 1000]
        }]
        input_metadata = tft_unit.metadata_from_feature_spec({
            'float32':
            tf.io.VarLenFeature(tf.float32),
            'foo_TIMESTAMP':
            tf.io.VarLenFeature(tf.int64)
        })

        output_timestep_1 = self.create_transform_output(timestamp_1)

        output_timestep_2 = self.create_transform_output(timestamp_2)

        for i in range(len(output_timestep_1)):
            values = stats.zscore([output_timestep_1[i], output_timestep_2[i]])
            n = numpy.isnan(values)
            values[n] = 0.0
            output_timestep_1[i] = values[0]
            output_timestep_2[i] = values[1]

        values = stats.zscore([1000.0, 2000.0])

        output_timestep_1.insert(0, values[0])
        output_timestep_2.insert(0, values[1])

        output = [output_timestep_1, output_timestep_2]

        expected_data = [{'Float32': output, 'LABEL': output}]

        expected_metadata = tft_unit.metadata_from_feature_spec({
            'Float32':
            tf.io.FixedLenFeature([config['timesteps'], 11], tf.float32),
            'LABEL':
            tf.io.FixedLenFeature([config['timesteps'], 11], tf.float32)
        })

        preprocessing_fn = functools.partial(
            encoder_decoder_preprocessing.preprocessing_fn,
            custom_config=config)

        self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                              preprocessing_fn, expected_data,
                                              expected_metadata)
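The NaN handling in the loop above matters because a z-score over two identical values divides by a zero standard deviation; a minimal reproduction of that pattern:

import numpy as np
from scipy import stats

# When both samples are equal, the standard deviation is 0 and zscore yields
# NaN (with a runtime warning); the test zeroes those entries out.
values = stats.zscore(np.array([5.0, 5.0]))  # -> [nan, nan]
values[np.isnan(values)] = 0.0
print(values)  # [0. 0.]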
Example No. 4
    def testGaussianizeSparse(self, input_dtype, elementwise):
        def preprocessing_fn(inputs):
            x_gaussianized = tf.sparse.to_dense(tft.scale_to_gaussian(
                tf.cast(inputs['x'], input_dtype), elementwise=elementwise),
                                                default_value=np.nan)
            x_gaussianized.set_shape([None, 4])
            self.assertEqual(x_gaussianized.dtype,
                             impl_test._mean_output_dtype(input_dtype))
            return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}

        input_data_values = [
            516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16,
            508, 669, 617, 502, 532, 517, 479
        ]
        input_data = []
        for idx, v in enumerate(input_data_values):
            input_data.append({
                'idx': [0, 1],
                'val': [v] + [-input_data_values[-1 - idx]]
            })
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.SparseFeature('idx', 'val',
                                tft_unit.canonical_numeric_dtype(input_dtype),
                                4)
        })
        if elementwise:
            expected_data_values = [
                -0.09304726, -2.24682532, 1.56900163, -0.78244931, 0.48285998,
                0.47461339, -1.50929952, -0.39008015, 0.41659823, -0.81174337,
                0.54027596, 2.11624695, -1.72816411, -0.16046759, 1.13320023,
                0.74814557, -0.21014091, 0.04373742, -0.08454805, -0.39008015
            ]
        else:
            expected_data_values = [
                0.91555131, -1.54543642, 1.30767697, 0.73634456, 1.03620536,
                1.03443104, 0.26969729, 0.84990131, 1.02201077, 0.72569862,
                1.04862563, 1.49752966, -0.02838919, 0.90135672, 1.18702292,
                1.09475806, 0.89071077, 0.9439405, 0.91732564, 0.84990131
            ]
        expected_data = []
        for idx, v in enumerate(expected_data_values):
            expected_data.append({
                'x_gaussianized': ([v] + [-expected_data_values[-1 - idx]] +
                                   [float('nan'), float('nan')])
            })

        expected_metadata = tft_unit.metadata_from_feature_spec(
            {'x_gaussianized': tf.io.FixedLenFeature([4], tf.float32)})
        self.assertAnalyzeAndTransformResults(input_data,
                                              input_metadata,
                                              preprocessing_fn,
                                              expected_data,
                                              expected_metadata,
                                              desired_batch_size=20,
                                              beam_pipeline=beam.Pipeline())
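The default_value=np.nan in the preprocessing_fn is what produces the trailing NaN pair in each expected row: only indices 0 and 1 of the length-4 sparse dimension are ever filled. In isolation:

import numpy as np
import tensorflow as tf

# Unfilled sparse slots surface as NaN after densification, matching the
# [v, -w, nan, nan] rows in expected_data.
st = tf.sparse.SparseTensor(indices=[[0], [1]], values=[1.0, 2.0],
                            dense_shape=[4])
print(tf.sparse.to_dense(st, default_value=np.nan))  # [1. 2. nan nan]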
    def testBucketizePerKey(self):
        def preprocessing_fn(inputs):
            x_bucketized = tft.bucketize_per_key(inputs['x'],
                                                 inputs['key'],
                                                 num_buckets=3,
                                                 epsilon=0.00001)
            return {'x': inputs['x'], 'x_bucketized': x_bucketized}

        # NOTE: We force 10 batches: data has 99 elements and we request a batch
        # size of 10.
        input_data = [{
            'x': x,
            'key': 'a' if x < 50 else 'b'
        } for x in range(1, 100)]
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.FixedLenFeature([], tf.float32),
            'key':
            tf.io.FixedLenFeature([], tf.string)
        })

        def compute_quantile(instance):
            if instance['key'] == 'a':
                if instance['x'] < 17:
                    return 0
                elif instance['x'] < 33:
                    return 1
                else:
                    return 2
            else:
                if instance['x'] < 66:
                    return 0
                elif instance['x'] < 83:
                    return 1
                else:
                    return 2

        expected_data = [{
            'x_bucketized': compute_quantile(instance),
            'x': instance['x']
        } for instance in input_data]
        expected_metadata = tft_unit.metadata_from_feature_spec(
            {
                'x': tf.io.FixedLenFeature([], tf.float32),
                'x_bucketized': tf.io.FixedLenFeature([], tf.int64),
            }, {
                'x_bucketized':
                schema_pb2.IntDomain(min=0, max=2, is_categorical=True),
            })
        self.assertAnalyzeAndTransformResults(input_data,
                                              input_metadata,
                                              preprocessing_fn,
                                              expected_data,
                                              expected_metadata,
                                              desired_batch_size=10)
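The cut points in compute_quantile follow from taking tertiles independently per key; key 'a' covers 1..49 and key 'b' covers 50..99. A quick sanity check with plain NumPy:

import numpy as np

print(np.percentile(np.arange(1, 50), [100 / 3, 200 / 3]))   # [17. 33.]
print(np.percentile(np.arange(50, 100), [100 / 3, 200 / 3]))  # approx [66.33 82.67]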
  def testQuantileBuckets(self, input_dtype):

    def analyzer_fn(inputs):
      return {
          'q_b':
              tft.quantiles(
                  tf.cast(inputs['x'], input_dtype),
                  num_buckets=3,
                  epsilon=0.00001)
      }

    # NOTE: We force 3 batches: data has 2999 elements and we request a batch
    # size of 1000.
    input_data = [{'x': [x]} for x in range(1, 3000)]
    input_metadata = tft_unit.metadata_from_feature_spec({
        'x':
            tf.io.FixedLenFeature([1],
                                  tft_unit.canonical_numeric_dtype(input_dtype))
    })
    # The expected data has 2 boundaries that divide the data into 3 buckets.
    expected_outputs = {'q_b': np.array([[1000, 2000]], np.float32)}
    self.assertAnalyzerOutputs(
        input_data,
        input_metadata,
        analyzer_fn,
        expected_outputs,
        desired_batch_size=1000)
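The expected boundaries are just the tertiles of 1..2999, which NumPy confirms up to the analyzer's epsilon:

import numpy as np

x = np.arange(1, 3000)
# approx [1000.3 1999.7], matching q_b = [1000, 2000] within epsilon.
print(np.percentile(x, [100 / 3, 200 / 3]))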
    def testElementwiseQuantileBucketsWithWeights(self, input_dtype):
        def analyzer_fn(inputs):
            return {
                'q_b':
                tft.quantiles(tf.cast(inputs['x'], input_dtype),
                              num_buckets=3,
                              epsilon=0.00001,
                              weights=inputs['weights'],
                              reduce_instance_dims=False)
            }

        input_data = [{
            'x': [[x, 2 * x], [2 * x, x]],
            'weights': [x / 100.]
        } for x in range(1, 3000)]
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.FixedLenFeature(
                [2, 2], tft_unit.canonical_numeric_dtype(input_dtype)),
            'weights':
            tf.io.FixedLenFeature([1], tf.float32)
        })
        # The expected data has 2 boundaries that divide the data into 3 buckets.
        expected_outputs = {
            'q_b':
            np.array(
                [[[1732, 2449], [3464, 4898]], [[3464, 4898], [1732, 2449]]],
                np.float32)
        }
        self.assertAnalyzerOutputs(input_data,
                                   input_metadata,
                                   analyzer_fn,
                                   expected_outputs,
                                   desired_batch_size=1000)
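Where do 1732 and 2449 come from? With weight proportional to x, the weighted CDF of the values 1..2999 grows like (x/N)^2, so the tertile boundaries sit near N*sqrt(1/3) and N*sqrt(2/3); the 2x columns simply double them. A back-of-the-envelope check:

import numpy as np

N = 2999
print(N * np.sqrt(1 / 3), N * np.sqrt(2 / 3))          # ~1731.5, ~2448.7
print(2 * N * np.sqrt(1 / 3), 2 * N * np.sqrt(2 / 3))  # ~3463.0, ~4897.3
# The test's 1732/2449 and 3464/4898 agree within the analyzer's epsilon.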
Example No. 8
    def testBucketizeSparseInput(self):
        def preprocessing_fn(inputs):
            return {
                'x_bucketized':
                tft.bucketize(inputs['x'], num_buckets=3, epsilon=0.00001)
            }

        input_data = [{
            'val': [x],
            'idx0': [x % 4],
            'idx1': [x % 5]
        } for x in range(1, 10)]
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.SparseFeature(['idx0', 'idx1'], 'val', tf.float32, [4, 5]),
        })

        def compute_bucket(instance):
            if instance['val'][0] < 4:
                return 0
            if instance['val'][0] < 7:
                return 1
            return 2

        expected_data = [{
            'x_bucketized$sparse_values': [compute_bucket(instance)],
            'x_bucketized$sparse_indices_0':
            instance['idx0'],
            'x_bucketized$sparse_indices_1':
            instance['idx1']
        } for instance in input_data]
        self.assertAnalyzeAndTransformResults(input_data, input_metadata,
                                              preprocessing_fn, expected_data)
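compute_bucket's cut points at 4 and 7 are just an even three-way split of the nine values:

import numpy as np

# Splitting 1..9 into equal thirds puts the bucket boundaries at 4 and 7.
print(np.array_split(np.arange(1, 10), 3))  # [1 2 3], [4 5 6], [7 8 9]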
  def _assert_quantile_boundaries(self,
                                  test_inputs,
                                  expected_boundaries,
                                  input_dtype,
                                  num_buckets=None,
                                  num_expected_buckets=None):

    if not num_buckets:
      num_buckets = len(expected_boundaries) + 1
    if not num_expected_buckets:
      num_expected_buckets = num_buckets

    def analyzer_fn(inputs):
      x = tf.cast(inputs['x'], input_dtype)
      return {'q_b': tft.quantiles(x, num_buckets, epsilon=0.0001)}

    input_data = [{'x': [x]} for x in test_inputs]

    input_metadata = tft_unit.metadata_from_feature_spec({
        'x':
            tf.io.FixedLenFeature([1],
                                  tft_unit.canonical_numeric_dtype(input_dtype))
    })

    expected_data = {'q_b': expected_boundaries}

    self.assertAnalyzerOutputs(
        input_data,
        input_metadata,
        analyzer_fn,
        expected_data,
        desired_batch_size=1000)
    def testTukeyHHAnalyzersWithNDDenseInputs(self):
        def analyzer_fn(inputs):
            a = inputs['a']

            return {
                'tukey_location':
                tft.tukey_location(a, reduce_instance_dims=False),
                'tukey_scale': tft.tukey_scale(a, reduce_instance_dims=False),
                'tukey_hl': tft.tukey_h_params(a,
                                               reduce_instance_dims=False)[0],
                'tukey_hr': tft.tukey_h_params(a,
                                               reduce_instance_dims=False)[1],
            }

        input_data_values = [
            516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16,
            508, 669, 617, 502, 532, 517, 479
        ]
        input_data = []
        for idx, v in enumerate(input_data_values):
            input_data.append({
                'a': [[v, -input_data_values[-1 - idx]],
                      [2 * v, -2 * input_data_values[-1 - idx]]]
            })
        input_metadata = tft_unit.metadata_from_feature_spec(
            {'a': tf.io.FixedLenFeature([2, 2], tf.float32)})
        expected_outputs = {
            'tukey_location':
            np.array(
                [[526.89355, -526.89355], [2. * 526.89355, -2. * 526.89355]],
                np.float32),
            'tukey_scale':
            np.array(
                [[116.73997, 116.73997], [2. * 116.73997, 2. * 116.73997]],
                np.float32),
            'tukey_hl':
            np.array([[0.6629082, 0.11148566], [0.6629082, 0.11148566]],
                     np.float32),
            'tukey_hr':
            np.array([[0.11148566, 0.6629082], [0.11148566, 0.6629082]],
                     np.float32)
        }

        self.assertAnalyzerOutputs(
            input_data,
            input_metadata,
            analyzer_fn,
            expected_outputs,
            desired_batch_size=20,
            # Runs the test deterministically on the whole batch.
            beam_pipeline=beam.Pipeline())
    def _assert_quantile_boundaries(self,
                                    test_inputs,
                                    expected_boundaries,
                                    input_dtype,
                                    num_buckets=None,
                                    num_expected_buckets=None,
                                    always_return_num_quantiles=True):

        if not num_buckets:
            num_buckets = len(expected_boundaries) + 1
        if not num_expected_buckets:
            num_expected_buckets = num_buckets

        def preprocessing_fn(inputs):
            x = tf.cast(inputs['x'], input_dtype)
            quantiles = tft.quantiles(
                x,
                num_buckets,
                epsilon=0.0001,
                always_return_num_quantiles=always_return_num_quantiles)
            quantiles.set_shape([1, num_expected_buckets - 1])
            return {'q_b': quantiles}

        input_data = [{'x': [x]} for x in test_inputs]

        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.FixedLenFeature(
                [1], tft_unit.canonical_numeric_dtype(input_dtype))
        })

        # The quantiles output has a leading dimension of 1, so each processed
        # batch contributes a single row of boundaries to the output.
        batch_size = 1000
        expected_data = []
        num_batches = int(math.ceil(len(test_inputs) / float(batch_size)))

        for _ in range(num_batches):
            expected_data += [{'q_b': expected_boundaries}]

        expected_metadata = None

        self.assertAnalyzeAndTransformResults(
            input_data,
            input_metadata,
            preprocessing_fn,
            expected_data,
            expected_metadata,
            desired_batch_size=batch_size,
            # TODO(b/110855155): Remove this explicit use of DirectRunner.
            beam_pipeline=beam.Pipeline())
    def testQuantilesPerKey(self):
        def analyzer_fn(inputs):
            key_vocab, q_b, scale_factor_per_key, shift_per_key, num_buckets = (
                analyzers._quantiles_per_key(inputs['x'],
                                             inputs['key'],
                                             num_buckets=3,
                                             epsilon=0.00001))
            return {
                'key_vocab': key_vocab,
                'q_b': q_b,
                'scale_factor_per_key': scale_factor_per_key,
                'shift_per_key': shift_per_key,
                'num_buckets': num_buckets,
            }

        # NOTE: We force 10 batches: data has 99 elements and we request a batch
        # size of 10.
        input_data = [{
            'x': [x],
            'key': 'a' if x < 50 else 'b'
        } for x in range(1, 100)]
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.FixedLenFeature([1], tf.int64),
            'key':
            tf.io.FixedLenFeature([], tf.string)
        })
        # The expected data has 2 boundaries that divide the data into 3 buckets.
        expected_outputs = {
            'key_vocab': np.array([b'a', b'b'], object),
            'q_b': np.array([0., 1., 2.], np.float32),
            'scale_factor_per_key': np.array([0.0625, 0.05882353], np.float32),
            'shift_per_key': np.array([-1.0625, -2.88235283], np.float32),
            'num_buckets': np.array(3, np.int64),
        }
        self.assertAnalyzerOutputs(input_data,
                                   input_metadata,
                                   analyzer_fn,
                                   expected_outputs,
                                   desired_batch_size=10)
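A quick arithmetic check on the per-key affine parameters above: x * scale_factor + shift maps each key's own tertile cut points onto integer positions of the shared q_b grid:

import numpy as np

scale = np.array([0.0625, 0.05882353])     # keys 'a', 'b'
shift = np.array([-1.0625, -2.88235283])
print(np.array([17, 33]) * scale[0] + shift[0])  # approx [0. 1.]  (key 'a')
print(np.array([66, 83]) * scale[1] + shift[1])  # approx [1. 2.]  (key 'b')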
  def testBucketization(self, test_inputs, expected_boundaries, do_shuffle,
                        epsilon, should_apply, is_manual_boundaries,
                        input_dtype):
    test_inputs = list(test_inputs)

    # Shuffle the input to add randomness to input generated with
    # simple range().
    if do_shuffle:
      random.shuffle(test_inputs)

    def preprocessing_fn(inputs):
      x = tf.cast(inputs['x'], input_dtype)
      num_buckets = len(expected_boundaries) + 1
      if should_apply:
        if is_manual_boundaries:
          bucket_boundaries = [expected_boundaries]
        else:
          bucket_boundaries = tft.quantiles(inputs['x'], num_buckets, epsilon)
        result = tft.apply_buckets(x, bucket_boundaries)
      else:
        result = tft.bucketize(x, num_buckets=num_buckets, epsilon=epsilon)
      return {'q_b': result}

    input_data = [{'x': [x]} for x in test_inputs]

    input_metadata = tft_unit.metadata_from_feature_spec({
        'x':
            tf.io.FixedLenFeature([1],
                                  tft_unit.canonical_numeric_dtype(input_dtype))
    })

    # Sort the input by value; the index is used to create expected_data.
    indexed_input = enumerate(test_inputs)

    sorted_list = sorted(indexed_input, key=lambda p: p[1])

    # Expected data has the same size as input, one bucket per input value.
    expected_data = [None] * len(test_inputs)
    bucket = 0
    for (index, x) in sorted_list:
      # Increment the bucket number when crossing the boundary
      if (bucket < len(expected_boundaries) and
          x >= expected_boundaries[bucket]):
        bucket += 1
      expected_data[index] = {'q_b': [bucket]}

    expected_metadata = tft_unit.metadata_from_feature_spec(
        {
            'q_b': tf.io.FixedLenFeature([1], tf.int64),
        }, {
            'q_b':
                schema_pb2.IntDomain(
                    min=0, max=len(expected_boundaries), is_categorical=True),
        })

    @contextlib.contextmanager
    def no_assert():
      yield None

    assertion = no_assert()
    if input_dtype == tf.float16:
      assertion = self.assertRaisesRegex(
          TypeError, '.*DataType float16 not in list of allowed values.*')

    with assertion:
      self.assertAnalyzeAndTransformResults(
          input_data,
          input_metadata,
          preprocessing_fn,
          expected_data,
          expected_metadata,
          desired_batch_size=1000)
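The no_assert helper is a hand-rolled no-op context manager; on Python 3.7+, contextlib.nullcontext() provides the same behavior out of the box:

import contextlib

# Drop-in replacement for the no_assert() pattern above.
assertion = contextlib.nullcontext()
with assertion:
    pass  # runs without asserting anything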
    def testBucketizePerKeyWithInfrequentKeys(self):
        def preprocessing_fn(inputs):
            x_bucketized = tft.bucketize_per_key(inputs['x'],
                                                 inputs['key'],
                                                 num_buckets=4,
                                                 epsilon=0.00001)
            return {'x': inputs['x'], 'x_bucketized': x_bucketized}

        input_data = [{
            'x': [],
            'key': []
        }, {
            'x': [5, 6],
            'key': ['a', 'a']
        }, {
            'x': [7],
            'key': ['a']
        }, {
            'x': [12],
            'key': ['b']
        }, {
            'x': [13],
            'key': ['b']
        }, {
            'x': [15],
            'key': ['c']
        }, {
            'x': [2],
            'key': ['d']
        }, {
            'x': [4],
            'key': ['d']
        }, {
            'x': [6],
            'key': ['d']
        }, {
            'x': [8],
            'key': ['d']
        }, {
            'x': [2],
            'key': ['e']
        }, {
            'x': [4],
            'key': ['e']
        }, {
            'x': [6],
            'key': ['e']
        }, {
            'x': [8],
            'key': ['e']
        }, {
            'x': [10],
            'key': ['e']
        }, {
            'x': [11],
            'key': ['e']
        }, {
            'x': [12],
            'key': ['e']
        }, {
            'x': [13],
            'key': ['e']
        }]  # pyformat: disable
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.VarLenFeature(tf.float32),
            'key':
            tf.io.VarLenFeature(tf.string)
        })
        expected_data = [{
            'x': [],
            'x_bucketized': []
        }, {
            'x': [5, 6],
            'x_bucketized': [1, 2]
        }, {
            'x': [7],
            'x_bucketized': [3]
        }, {
            'x': [12],
            'x_bucketized': [1]
        }, {
            'x': [13],
            'x_bucketized': [3]
        }, {
            'x': [15],
            'x_bucketized': [1]
        }, {
            'x': [2],
            'x_bucketized': [0]
        }, {
            'x': [4],
            'x_bucketized': [1]
        }, {
            'x': [6],
            'x_bucketized': [2]
        }, {
            'x': [8],
            'x_bucketized': [3]
        }, {
            'x': [2],
            'x_bucketized': [0]
        }, {
            'x': [4],
            'x_bucketized': [0]
        }, {
            'x': [6],
            'x_bucketized': [1]
        }, {
            'x': [8],
            'x_bucketized': [1]
        }, {
            'x': [10],
            'x_bucketized': [2]
        }, {
            'x': [11],
            'x_bucketized': [2]
        }, {
            'x': [12],
            'x_bucketized': [3]
        }, {
            'x': [13],
            'x_bucketized': [2]
        }]  # pyformat: disable
        expected_metadata = tft_unit.metadata_from_feature_spec(
            {
                'x': tf.io.VarLenFeature(tf.float32),
                'x_bucketized': tf.io.VarLenFeature(tf.int64),
            }, {
                'x_bucketized':
                schema_pb2.IntDomain(min=0, max=3, is_categorical=True),
            })
        self.assertAnalyzeAndTransformResults(input_data,
                                              input_metadata,
                                              preprocessing_fn,
                                              expected_data,
                                              expected_metadata,
                                              desired_batch_size=10)
Example No. 15
    def testBucketizePerKeySparse(self):
        def preprocessing_fn(inputs):
            x_bucketized = tft.bucketize_per_key(inputs['x'],
                                                 inputs['key'],
                                                 num_buckets=3,
                                                 epsilon=0.00001)
            return {'x_bucketized': x_bucketized}

        # NOTE: We force 10 batches: data has 100 elements and we request a batch
        # size of 10.
        input_data = [{
            'x': [x],
            'idx0': [0],
            'idx1': [0],
            'key': ['a'] if x < 50 else ['b']
        } for x in range(1, 100)]
        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.SparseFeature(['idx0', 'idx1'], 'x', tf.float32, (2, 2)),
            'key':
            tf.io.VarLenFeature(tf.string)
        })

        def compute_bucket(instance):
            if instance['key'][0] == 'a':
                if instance['x'][0] < 17:
                    return 0
                elif instance['x'][0] < 33:
                    return 1
                else:
                    return 2
            else:
                if instance['x'][0] < 66:
                    return 0
                elif instance['x'][0] < 83:
                    return 1
                else:
                    return 2

        expected_data = [{
            'x_bucketized$sparse_values': [compute_bucket(instance)],
            'x_bucketized$sparse_indices_0': [0],
            'x_bucketized$sparse_indices_1': [0],
        } for instance in input_data]
        expected_metadata = tft_unit.metadata_from_feature_spec(
            {
                'x_bucketized':
                tf.io.SparseFeature([
                    'x_bucketized$sparse_indices_0',
                    'x_bucketized$sparse_indices_1'
                ],
                                    'x_bucketized$sparse_values',
                                    tf.int64, (None, None),
                                    already_sorted=True),
            }, {
                'x_bucketized':
                schema_pb2.IntDomain(min=0, max=2, is_categorical=True),
            })
        self.assertAnalyzeAndTransformResults(input_data,
                                              input_metadata,
                                              preprocessing_fn,
                                              expected_data,
                                              expected_metadata,
                                              desired_batch_size=10)
    def testTukeyHHAnalyzersWithSparseInputs(self,
                                             input_dtype,
                                             output_dtypes,
                                             elementwise=True):
        def analyzer_fn(inputs):
            a = tf.cast(inputs['a'], input_dtype)

            def assert_and_cast_dtype(tensor, out_dtype):
                self.assertEqual(tensor.dtype, out_dtype)
                return tf.cast(tensor,
                               tft_unit.canonical_numeric_dtype(out_dtype))

            return {
                'tukey_location':
                assert_and_cast_dtype(
                    tft.tukey_location(a,
                                       reduce_instance_dims=not elementwise),
                    output_dtypes['tukey_location']),
                'tukey_scale':
                assert_and_cast_dtype(
                    tft.tukey_scale(a, reduce_instance_dims=not elementwise),
                    output_dtypes['tukey_scale']),
                'tukey_hl':
                assert_and_cast_dtype(
                    tft.tukey_h_params(
                        a, reduce_instance_dims=not elementwise)[0],
                    output_dtypes['tukey_hl']),
                'tukey_hr':
                assert_and_cast_dtype(
                    tft.tukey_h_params(
                        a, reduce_instance_dims=not elementwise)[1],
                    output_dtypes['tukey_hr']),
            }

        input_data_values = [
            516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16,
            508, 669, 617, 502, 532, 517, 479
        ]
        input_data = []
        for idx, v in enumerate(input_data_values):
            input_data.append({
                'idx0': [0, 0],
                'idx1': [0, 1],
                'val': [v, -input_data_values[-1 - idx]]
            })
        input_metadata = tft_unit.metadata_from_feature_spec({
            'a':
            tf.io.SparseFeature(['idx0', 'idx1'], 'val',
                                tft_unit.canonical_numeric_dtype(input_dtype),
                                (2, 2))
        })

        expected_outputs = {
            'tukey_location':
            np.array(
                [[526.89355, -526.89355], [0., 0.]] if elementwise else 0.0,
                tft_unit.canonical_numeric_dtype(
                    output_dtypes['tukey_location']).as_numpy_dtype),
            'tukey_scale':
            np.array([[116.73997, 116.73997], [1., 1.]]
                     if elementwise else 572.2776,
                     tft_unit.canonical_numeric_dtype(
                         output_dtypes['tukey_scale']).as_numpy_dtype),
            'tukey_hl':
            np.array(
                [[0.6629082, 0.11148566], [0., 0.]] if elementwise else 0.0,
                tft_unit.canonical_numeric_dtype(
                    output_dtypes['tukey_hl']).as_numpy_dtype),
            'tukey_hr':
            np.array(
                [[0.11148566, 0.6629082], [0., 0.]] if elementwise else 0.0,
                tft_unit.canonical_numeric_dtype(
                    output_dtypes['tukey_hr']).as_numpy_dtype),
        }

        self.assertAnalyzerOutputs(
            input_data,
            input_metadata,
            analyzer_fn,
            expected_outputs,
            desired_batch_size=20,
            # Runs the test deterministically on the whole batch.
            beam_pipeline=beam.Pipeline())
    def testBucketizationElementwise(self, test_inputs, expected_boundaries,
                                     do_shuffle, epsilon, should_apply,
                                     is_manual_boundaries, input_dtype):
        test_inputs = list(test_inputs)

        # Shuffle the input to add randomness to input generated with
        # simple range().
        if do_shuffle:
            random.shuffle(test_inputs)

        def preprocessing_fn(inputs):
            x = tf.cast(inputs['x'], input_dtype)

            num_buckets = len(expected_boundaries) + 1
            if should_apply:
                if is_manual_boundaries:
                    bucket_boundaries = [
                        expected_boundaries,
                        [2 * b for b in expected_boundaries]
                    ]
                else:
                    bucket_boundaries = tft.quantiles(
                        x, num_buckets, epsilon, reduce_instance_dims=False)
                    bucket_boundaries = tf.unstack(bucket_boundaries, axis=0)

                result = []
                for i, boundaries in enumerate(bucket_boundaries):
                    boundaries = tf.cast(boundaries, tf.float32)
                    result.append(
                        tft.apply_buckets(x[:, i],
                                          tf.expand_dims(boundaries, axis=0)))
                result = tf.stack(result, axis=1)

            else:
                result = tft.bucketize(x,
                                       num_buckets=num_buckets,
                                       epsilon=epsilon,
                                       elementwise=True)
            return {'q_b': result}

        input_data = [{'x': [x, 2 * x]} for x in test_inputs]

        input_metadata = tft_unit.metadata_from_feature_spec({
            'x':
            tf.io.FixedLenFeature(
                [2], tft_unit.canonical_numeric_dtype(input_dtype))
        })

        # Sort the input by value; the index is used to create expected_data.
        sorted_list = sorted(enumerate(test_inputs), key=lambda p: p[1])

        # Expected data has the same size as input, one bucket per input value.
        expected_data = [None] * len(test_inputs)
        bucket = 0

        for (index, x) in sorted_list:
            # Increment the bucket number when crossing the boundary
            if (bucket < len(expected_boundaries)
                    and x >= expected_boundaries[bucket]):
                bucket += 1
            expected_data[index] = {'q_b': [bucket, bucket]}

        expected_metadata = tft_unit.metadata_from_feature_spec(
            {
                'q_b': tf.io.FixedLenFeature([2], tf.int64),
            }, None)

        @contextlib.contextmanager
        def no_assert():
            yield None

        assertion = no_assert()
        if input_dtype == tf.float16:
            assertion = self.assertRaisesRegex(
                TypeError,
                '.*DataType float16 not in list of allowed values.*')

        with assertion:
            self.assertAnalyzeAndTransformResults(
                input_data,
                input_metadata,
                preprocessing_fn,
                expected_data,
                expected_metadata,
                desired_batch_size=1000,
                # TODO(b/110855155): Remove this explicit use of DirectRunner.
                beam_pipeline=beam.Pipeline())