def testGetBroadcastableColumnNotFound(self):
     """Asking for a column the table lacks must raise ValueError."""
     table_without_w = pa.Table.from_arrays(
         [pa.array([[1], [2]]), pa.array([[1], [3]])], ["u", "v"])
     missing_column_error = r'Column "w" not present in the input table\.'
     with self.assertRaisesRegex(ValueError, missing_column_error):
         arrow_util.get_broadcastable_column(
             table_without_w, column_name="w")
 def testGetArrayBroadcastString(self):
     """get_array repeats a string broadcast column once per leaf list."""
     first_example = [{"sf": [{"ssf": [[1]]}, {"ssf": [[2]]}]}]
     second_example = [{"sf": [{"ssf": [[3], [4]]}]}]
     table = pa.Table.from_arrays(
         [
             pa.array([first_example, second_example]),
             pa.array([["one"], ["two"]]),
         ],
         ["f", "w"])
     path = types.FeaturePath(["f", "sf", "ssf"])

     got_values, got_weights = arrow_util.get_array(
         table, path, broadcast_column_name="w")

     want_values = pa.array([[[1]], [[2]], [[3], [4]]])
     # The first example contributes two leaf lists, so "one" appears twice.
     want_weights = np.array(["one", "one", "two"])
     mismatch_report = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
         path, want_values, got_values)
     self.assertTrue(got_values.equals(want_values), mismatch_report)
     np.testing.assert_array_equal(want_weights, got_weights)
 def test_lift_min_x_count_filters_all(self):
     """Expects no lift output when min_x_count=4 exceeds every x count."""
     x_column = pa.array([['a'], ['a'], ['b'], ['a']])
     y_column = pa.array([['cat'], ['dog'], ['cat'], ['dog']])
     input_tables = [
         pa.Table.from_arrays([x_column, y_column],
                              ['categorical_x', 'string_y']),
     ]
     schema_text = """
     feature {
       name: 'categorical_x'
       type: BYTES
     }
     feature {
       name: 'string_y'
       type: BYTES
     }
     """
     parsed_schema = text_format.Parse(schema_text, schema_pb2.Schema())
     no_output = []
     lift_generator = lift_stats_generator.LiftStatsGenerator(
         schema=parsed_schema,
         y_path=types.FeaturePath(['string_y']),
         min_x_count=4)
     self.assertSlicingAwareTransformOutputEqual(
         input_tables,
         lift_generator,
         no_output,
         add_default_slice_key_to_input=True,
         add_default_slice_key_to_output=True)
Beispiel #4
0
 def testGetArrayReturnExampleIndices(self):
     """get_array reports the source example index of each leaf list."""
     first_example = [{"sf": [{"ssf": [1]}, {"ssf": [2]}]}]
     second_example = [{"sf": [{"ssf": [3, 4]}]}]
     table = pa.Table.from_arrays(
         [
             pa.array([first_example, second_example]),
             pa.array([["one"], ["two"]]),
         ],
         ["f", "w"])
     path = types.FeaturePath(["f", "sf", "ssf"])

     got_values, got_indices = arrow_util.get_array(
         table, path, return_example_indices=True)

     want_values = pa.array([[1], [2], [3, 4]])
     # Two leaf lists come from example 0 and one from example 1.
     want_indices = np.array([0, 0, 1])
     mismatch_report = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
         path, want_values, got_values)
     self.assertTrue(got_values.equals(want_values), mismatch_report)
     np.testing.assert_array_equal(want_indices, got_indices)
    def test_nl_generator_values_threshold_check(self):
        """Tests generator values threshold with fake heuristic.

        The batches contain six 'MATCH' values in total. A generator built with
        values_threshold=7 is expected to emit empty statistics, while one
        built with values_threshold=6 is expected to emit the natural-language
        custom stats.
        """
        # pa.Column was removed in pyarrow 0.15, so the batches are passed as
        # plain arrays, the same way the other NL generator tests do.
        input_batches = [
            pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH']]),
            pa.array([['MATCH', 'MATCH']]),
            # Nones should be ignored.
            pa.array([None, None]),
        ]
        # Try generators with values_threshold=7 (should not create stats) and
        # 6 (should create stats)
        generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=7)
        self.assertCombinerOutputEqual(input_batches, generator,
                                       statistics_pb2.FeatureNameStatistics())

        generator = nlsg.NLStatsGenerator(_FakeHeuristic(), values_threshold=6)
        self.assertCombinerOutputEqual(
            input_batches, generator,
            statistics_pb2.FeatureNameStatistics(custom_stats=[
                statistics_pb2.CustomStatistic(
                    name='domain_info', str='natural_language_domain {}'),
                statistics_pb2.CustomStatistic(
                    name='natural_language_match_rate', num=1.0)
            ]))
 def test_mi_with_invalid_features(self):
     """Expects compute() to raise ValueError when no feature is usable.

     The batch holds a univalent label and a single multivalent feature
     (value_count 2..2); the expected error text is
     "Found array with 0 sample".
     """
     batch = pa.Table.from_arrays(
         [pa.array([[1]]), pa.array([[1, 2]])],
         ["label_key", "multivalent_feature"])
     schema = text_format.Parse(
         """
     feature {
       name: "label_key"
       type: INT
       shape {
         dim {
           size: 1
         }
       }
     }
     feature {
       name: "multivalent_feature"
       type: INT
       value_count: {
         min: 2
         max: 2
       }
     }
     """, schema_pb2.Schema())
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; assertRaisesRegex is the supported spelling.
     with self.assertRaisesRegex(ValueError, "Found array with 0 sample"):
         sklearn_mutual_information.SkLearnMutualInformation(
             types.FeaturePath(["label_key"]), schema,
             TEST_SEED).compute(batch)
    def test_nl_generator_avg_word_heuristic_match(self):
        """Tests generator with avg word length heuristic.

        Six values are fed in across three batches and the expected
        natural_language_match_rate is 0.8333333 (five of six).
        """
        generator = nlsg.NLStatsGenerator(values_threshold=2)
        # pa.Column was removed in pyarrow 0.15, so the batches are passed as
        # plain arrays, the same way the other NL generator tests do.
        input_batches = [
            pa.array([[
                'This looks correct.', 'This one too, it should be text.'
            ], ['xosuhddsofuhg123fdgosh']]),
            pa.array(
                [['This should be text as well',
                  'Here is another text']]),
            pa.array([['This should also be considered good.']]),
        ]

        self.assertCombinerOutputEqual(
            input_batches, generator,
            statistics_pb2.FeatureNameStatistics(custom_stats=[
                statistics_pb2.CustomStatistic(
                    name='domain_info', str='natural_language_domain {}'),
                statistics_pb2.CustomStatistic(
                    name='natural_language_match_rate', num=0.8333333)
            ]))
 def test_lift_no_categorical_features(self):
     """Expects no lift output when the only x feature is a FLOAT.

     The schema declares 'continuous_x' as FLOAT and 'int_y' as a
     categorical INT; 'int_y' is used as the y path, so no categorical x
     feature remains and the expected result is empty.
     """
     examples = [
         pa.Table.from_arrays([
             pa.array([[1.0], [2.0], [3.0], [4.0]]),
             pa.array([[1], [0], [1], [0]]),
         # The column name must match the schema's 'continuous_x'; it was
         # previously misspelled, so the schema entry never applied.
         ], ['continuous_x', 'int_y']),
     ]
     schema = text_format.Parse(
         """
     feature {
       name: 'continuous_x'
       type: FLOAT
     }
     feature {
       name: 'int_y'
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     expected_result = []
     generator = lift_stats_generator.LiftStatsGenerator(
         schema=schema, y_path=types.FeaturePath(['int_y']))
     self.assertSlicingAwareTransformOutputEqual(
         examples,
         generator,
         expected_result,
         add_default_slice_key_to_input=True,
         add_default_slice_key_to_output=True)
Beispiel #9
0
    def test_batch_examples(self):
        """Batches five example dicts into Arrow tables of at most two rows."""
        float_values = lambda vals: np.array(vals, dtype=np.float32)
        input_examples = [
            {
                'a': float_values([1.0, 2.0]),
                'b': np.array(['a', 'b', 'c', 'e']),
            },
            {
                'a': float_values([3.0, 4.0, 5.0]),
            },
            {
                'b': np.array(['d', 'e', 'f']),
                'd': np.array([10, 20, 30], dtype=np.int64),
            },
            {
                'b': np.array(['a', 'b', 'c']),
            },
            {
                'c': np.array(['d', 'e', 'f']),
            },
        ]

        # A feature absent from an example shows up as a null row.
        first_batch = pa.Table.from_arrays([
            pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]],
                     type=pa.list_(pa.float32())),
            pa.array([['a', 'b', 'c', 'e'], None])
        ], ['a', 'b'])
        second_batch = pa.Table.from_arrays([
            pa.array([['d', 'e', 'f'], ['a', 'b', 'c']]),
            pa.array([[10, 20, 30], None], type=pa.list_(pa.int64()))
        ], ['b', 'd'])
        last_batch = pa.Table.from_arrays([pa.array([['d', 'e', 'f']])], ['c'])
        wanted_tables = [first_batch, second_batch, last_batch]

        with beam.Pipeline() as pipeline:
            batched = (
                pipeline
                | beam.Create(input_examples)
                | batch_util.BatchExamplesToArrowTables(desired_batch_size=2))
            matcher = test_util.make_arrow_tables_equal_fn(self, wanted_tables)
            util.assert_that(batched, matcher)
    def test_mi_with_missing_label_key(self):
        """Expects ValueError when the label path is not in the schema.

        The schema declares features "fa" and "label", but the generator is
        constructed with the label path "label_key".
        """
        batch = pa.Table.from_arrays(
            [pa.array([[1]]), pa.array([[1]])], ["label", "fa"])

        schema = text_format.Parse(
            """
          feature {
            name: "fa"
            type: FLOAT
              shape {
              dim {
                size: 1
              }
            }
          }
          feature {
            name: "label"
            type: FLOAT
            shape {
              dim {
                size: 1
              }
            }
          }
          """, schema_pb2.Schema())

        # assertRaisesRegexp is a deprecated alias that was removed in
        # Python 3.12; assertRaisesRegex is the supported spelling.
        with self.assertRaisesRegex(
                ValueError, "Feature label_key not found in the schema."):
            sklearn_mutual_information.SkLearnMutualInformation(
                types.FeaturePath(["label_key"]), schema,
                TEST_SEED).compute(batch)
    def test_mi_with_multivalent_label(self):
        """Expects ValueError when the label column holds multiple values.

        The label row is [1, 2] and the schema allows value_count 1..2 for
        "label_key"; the expected error text is
        "Label column contains unsupported data.".
        """
        batch = pa.Table.from_arrays(
            [pa.array([[1, 2]]), pa.array([[1]])], ["label_key", "fa"])
        schema = text_format.Parse(
            """
          feature {
            name: "fa"
            type: FLOAT
            shape {
              dim {
                size: 1
              }
            }
          }
          feature {
            name: "label_key"
            type: FLOAT
            value_count: {
              min: 1
              max: 2
            }
          }
          """, schema_pb2.Schema())

        # assertRaisesRegexp is a deprecated alias that was removed in
        # Python 3.12; assertRaisesRegex is the supported spelling.
        with self.assertRaisesRegex(
                ValueError, "Label column contains unsupported data."):
            sklearn_mutual_information.SkLearnMutualInformation(
                types.FeaturePath(["label_key"]), schema,
                TEST_SEED).compute(batch)
 def test_time_stats_generator_match_ratio_with_same_valid_format(self):
     """Tests match ratio where all valid values have the same format.

     Five values are '2018-11-30' and five are 'not-valid', so the expected
     time_match_ratio is 0.50: no stats with match_ratio=0.51, stats with
     match_ratio=0.49.
     """
     # pa.Column was removed in pyarrow 0.15, so the batches are passed as
     # plain arrays.
     # NOTE(review): assumes TimeStatsGenerator accepts arrays like the
     # other combiner generators in this suite - confirm.
     input_batches = [
         pa.array([['2018-11-30', '2018-11-30', '2018-11-30'],
                   ['2018-11-30', '2018-11-30']]),
         pa.array([['not-valid', 'not-valid', 'not-valid'],
                   ['not-valid', 'not-valid']]),
     ]
     # Try generator with match_ratio 0.51 (should not create stats).
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.51,
                                                         values_threshold=5)
     self.assertCombinerOutputEqual(input_batches, generator,
                                    statistics_pb2.FeatureNameStatistics())
     # Try generator with match_ratio 0.49 (should create stats).
     generator = time_stats_generator.TimeStatsGenerator(match_ratio=0.49,
                                                         values_threshold=5)
     self.assertCombinerOutputEqual(
         input_batches, generator,
         statistics_pb2.FeatureNameStatistics(custom_stats=[
             statistics_pb2.CustomStatistic(
                 name='domain_info',
                 str="time_domain {string_format: '%Y-%m-%d'}"),
             statistics_pb2.CustomStatistic(name='time_match_ratio',
                                            num=0.50),
         ]))
    def test_time_stats_generator_values_threshold_check(self):
        """Tests generator values threshold.

        The batches contain six '2018-11-30' values plus two nulls. No stats
        are expected with values_threshold=7; stats with a time_match_ratio
        of 1.0 are expected with values_threshold=6.
        """
        # Expected to give 6 matches with the same format.
        # pa.Column was removed in pyarrow 0.15, so the batches are passed as
        # plain arrays.
        # NOTE(review): assumes TimeStatsGenerator accepts arrays like the
        # other combiner generators in this suite - confirm.
        input_batches = [
            pa.array([['2018-11-30', '2018-11-30', '2018-11-30'],
                      ['2018-11-30']]),
            pa.array([['2018-11-30', '2018-11-30']]),
            pa.array([None, None]),
        ]
        # Try generator with values_threshold=7 (should not create stats).
        generator = time_stats_generator.TimeStatsGenerator(values_threshold=7)
        self.assertCombinerOutputEqual(input_batches, generator,
                                       statistics_pb2.FeatureNameStatistics())

        # Try generator with values_threshold=6 (should create stats).
        generator = time_stats_generator.TimeStatsGenerator(values_threshold=6)
        self.assertCombinerOutputEqual(
            input_batches, generator,
            statistics_pb2.FeatureNameStatistics(custom_stats=[
                statistics_pb2.CustomStatistic(
                    name='domain_info',
                    str="time_domain {string_format: '%Y-%m-%d'}"),
                statistics_pb2.CustomStatistic(name='time_match_ratio',
                                               num=1.0),
            ]))
    def test_invalid_input_type(self):
        """Checks the RuntimeErrors raised for unsupported argument types."""
        list_array_fns = [
            arrow_util.ListLengthsFromListArray,
            arrow_util.GetFlattenedArrayParentIndices,
        ]
        any_array_fns = [arrow_util.GetArrayNullBitmapAsByteArray]
        binary_array_fns = [arrow_util.GetBinaryArrayTotalByteSize]

        # A plain Python int is not an Arrow array at all.
        for fn in list_array_fns + any_array_fns + binary_array_fns:
            with self.assertRaisesRegex(RuntimeError,
                                        "Could not unwrap Array"):
                fn(1)

        # A flat int array is an Array but not a ListArray.
        flat_ints = [1, 2, 3]
        for fn in list_array_fns:
            with self.assertRaisesRegex(RuntimeError,
                                        "Expected ListArray but got"):
                fn(pa.array(flat_ints))

        # A list-of-ints array is not a BinaryArray.
        nested_ints = [[1, 2, 3]]
        for fn in binary_array_fns:
            with self.assertRaisesRegex(RuntimeError, "Expected BinaryArray"):
                fn(pa.array(nested_ints))
    def test_nl_generator_match_ratio_check(self):
        """Tests generator match ratio with fake heuristic."""
        batches = [
            pa.array([['MATCH', 'MATCH', 'MATCH'], ['MATCH', 'Nope']]),
            pa.array([['MATCH', 'MATCH', 'MATCH']]),
            pa.array([['12345', 'No']]),
        ]

        def _build_generator(ratio):
            # values_threshold=5 so the threshold check always passes.
            return nlsg.NLStatsGenerator(_FakeHeuristic(),
                                         match_ratio=ratio,
                                         values_threshold=5)

        # match_ratio=0.71 is above the expected 0.7 rate: no stats.
        empty_stats = statistics_pb2.FeatureNameStatistics()
        self.assertCombinerOutputEqual(batches, _build_generator(0.71),
                                       empty_stats)

        # match_ratio=0.69 is below the expected 0.7 rate: stats are created.
        domain_stat = statistics_pb2.CustomStatistic(
            name='domain_info', str='natural_language_domain {}')
        rate_stat = statistics_pb2.CustomStatistic(
            name='natural_language_match_rate', num=0.7)
        populated_stats = statistics_pb2.FeatureNameStatistics(
            custom_stats=[domain_stat, rate_stat])
        self.assertCombinerOutputEqual(batches, _build_generator(0.69),
                                       populated_stats)
Beispiel #16
0
    def test_stats_pipeline_with_sample_count(self):
        """Runs GenerateStatistics with sample_count=3000 over three tables."""

        def _single_row_table():
            # One row whose value list holds the int32 sequence 1..3000.
            values = np.linspace(1, 3000, 3000, dtype=np.int32)
            return pa.Table.from_arrays([pa.array([values])], ['c'])

        input_tables = [_single_row_table() for _ in range(3)]

        with beam.Pipeline() as pipeline:
            sampling_options = stats_options.StatsOptions(
                sample_count=3000,
                num_top_values=2,
                num_rank_histogram_buckets=2,
                num_values_histogram_buckets=2,
                num_histogram_buckets=2,
                num_quantiles_histogram_buckets=2,
                epsilon=0.001,
                desired_batch_size=3000)
            stats = (
                pipeline
                | beam.Create(input_tables)
                | stats_api.GenerateStatistics(sampling_options))
            matcher = test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, self._sampling_test_expected_result)
            util.assert_that(stats, matcher)
Beispiel #17
0
    def test_csv_decoder_with_schema(self):
        """Decodes CSV lines with column types taken from the schema."""
        csv_lines = ['1,1,2.0,hello', '5,5,12.34,world']
        header = [
            'int_feature_parsed_as_float', 'int_feature', 'float_feature',
            'str_feature'
        ]
        schema_text = """
        feature { name: "int_feature_parsed_as_float" type: FLOAT }
        feature { name: "int_feature" type: INT }
        feature { name: "float_feature" type: FLOAT }
        feature { name: "str_feature" type: BYTES }
        """
        parsed_schema = text_format.Parse(schema_text, schema_pb2.Schema())

        # The first column holds integers but the schema says FLOAT, so it
        # is expected as float32.
        wanted_columns = [
            pa.array([[1], [5]], pa.list_(pa.float32())),
            pa.array([[1], [5]], pa.list_(pa.int64())),
            pa.array([[2.0], [12.34]], pa.list_(pa.float32())),
            pa.array([[b'hello'], [b'world']], pa.list_(pa.binary())),
        ]
        wanted_tables = [pa.Table.from_arrays(wanted_columns, list(header))]

        with beam.Pipeline() as pipeline:
            decoder = csv_decoder.DecodeCSV(column_names=header,
                                            schema=parsed_schema,
                                            infer_type_from_schema=True)
            decoded = pipeline | beam.Create(csv_lines) | decoder
            util.assert_that(
                decoded,
                test_util.make_arrow_tables_equal_fn(self, wanted_tables))
Beispiel #18
0
 def test_image_stats_generator_real_image(self):
     """Runs the image stats generator over files read from testdata."""
     data_dir = os.path.join(os.path.dirname(__file__), 'testdata')

     def _load(file_name):
         return _read_file(os.path.join(data_dir, file_name))

     first_batch = pa.array([
         [
             _load('image1.gif'),
             _load('image2.png'),
             _load('not_a_image.abc'),
         ],
         [
             _load('image3.bmp'),
             b'not_a_image',
         ],
     ])
     second_batch = pa.array([[_load('image4.png')]])
     image_batches = [first_batch, second_batch]

     expected_text = """
         custom_stats {
           name: 'domain_info'
           str: 'image_domain {}'
         }
         custom_stats {
           name: 'image_format_histogram'
           rank_histogram {
             buckets {
               label: 'UNKNOWN'
               sample_count: 2
             }
             buckets {
               label: 'bmp'
               sample_count: 1
             }
             buckets {
               label: 'gif'
               sample_count: 1
             }
             buckets {
               label: 'png'
               sample_count: 2
             }
           }
         }
         custom_stats {
           name: 'image_max_width'
           num: 51.0
         }
         custom_stats {
           name: 'image_max_height'
           num: 26.0
         }
         """
     wanted_stats = text_format.Parse(
         expected_text, statistics_pb2.FeatureNameStatistics())
     stats_generator = image_stats_generator.ImageStatsGenerator(
         is_image_ratio_threshold=0.6,
         values_threshold=1,
         enable_size_stats=True)
     self.assertCombinerOutputEqual(
         image_batches, stats_generator, wanted_stats)
Beispiel #19
0
 def test_image_stats_generator_disable_size_stats(self):
     """Test the enable_size_stats_option."""
     encode = FakeImageDecoder.encode_image_metadata
     # Identical input to test_image_stats_generator_check_is_image_ratio
     first_batch = pa.array([
         [
             encode('PNG', 2, 4),
             encode('JPEG', 4, 2),
         ],
         [
             encode('TIFF', 5, 1),
             encode('', -1, -1),
             encode('TIFF', 3, 7),
         ],
     ])
     second_batch = pa.array([[encode('GIF', 2, 1)]])
     image_batches = [first_batch, second_batch]

     # Stats should be identical but without stats for image size.
     expected_text = """
         custom_stats {
           name: 'domain_info'
           str: 'image_domain {}'
         }
         custom_stats {
           name: 'image_format_histogram'
           rank_histogram {
             buckets {
               label: 'UNKNOWN'
               sample_count: 1
             }
             buckets {
               label: 'GIF'
               sample_count: 1
             }
             buckets {
               label: 'JPEG'
               sample_count: 1
             }
             buckets {
               label: 'PNG'
               sample_count: 1
             }
             buckets {
               label: 'TIFF'
               sample_count: 2
             }
           }
         }
         """
     wanted_stats = text_format.Parse(
         expected_text, statistics_pb2.FeatureNameStatistics())
     stats_generator = image_stats_generator.ImageStatsGenerator(
         image_decoder=FakeImageDecoder(),
         is_image_ratio_threshold=0.8,
         values_threshold=1,
         enable_size_stats=False)
     self.assertCombinerOutputEqual(
         image_batches, stats_generator, wanted_stats)
 def testGetArrayEmptyPath(self):
     """get_array rejects an empty FeaturePath with a KeyError."""
     table = pa.Table.from_arrays(
         [pa.array([[1], [2, 3]]), pa.array([[1], [2, 2]])], ["v", "w"])
     empty_path = types.FeaturePath([])
     empty_path_error = r"query_path must be non-empty.*"
     with self.assertRaisesRegex(KeyError, empty_path_error):
         arrow_util.get_array(
             table, query_path=empty_path, broadcast_column_name="w")
    def test_get_flattened_array_parent_indices(self):
        """Parent indices are int32 and skip rows that hold empty lists."""
        # An empty list array yields an empty int32 index array.
        empty_lists = pa.array([], type=pa.list_(pa.int32()))
        got = arrow_util.GetFlattenedArrayParentIndices(empty_lists)
        want = pa.array([], type=pa.int32())
        self.assertTrue(got.equals(want))

        # Row 2 is an empty list, so index 2 never appears.
        float_lists = pa.array([[1.], [2.], [], [3.]])
        got = arrow_util.GetFlattenedArrayParentIndices(float_lists)
        want = pa.array([0, 1, 3], type=pa.int32())
        self.assertTrue(got.equals(want))
  def test_flatten_list_array(self):
    """FlattenListArray returns the concatenated values of a list array."""
    # An empty int64 list array flattens to an empty int64 array.
    empty_lists = pa.array([], type=pa.list_(pa.int64()))
    got = arrow_util.FlattenListArray(empty_lists)
    self.assertTrue(got.equals(pa.array([], type=pa.int64())))

    # The empty list in row 2 contributes no values.
    float_lists = pa.array([[1.], [2.], [], [3.]])
    got = arrow_util.FlattenListArray(float_lists)
    self.assertTrue(got.equals(pa.array([1., 2., 3.])))
 def testGetBroadcastableColumnTooManyValues(self):
     """A broadcast column with two values in one example is rejected."""
     table = pa.Table.from_arrays(
         [pa.array([[1], [2, 3]]), pa.array([[1], [2, 2]])], ["v", "w"])
     too_many_values_error = (
         r'Column "w" must have exactly one value in each example\.')
     with self.assertRaisesRegex(ValueError, too_many_values_error):
         arrow_util.get_broadcastable_column(table, column_name="w")
 def test_basic_stats_generator_categorical_feature(self):
     """A categorical INT feature is expected to produce string_stats."""
     first_batch = pa.Table.from_arrays([pa.array([[1, 5, 10], [0]])], ['c'])
     second_batch = pa.Table.from_arrays(
         [pa.array([[1, 1, 1, 5, 15], [-1]])], ['c'])
     input_batches = [first_batch, second_batch]

     expected_stats_text = """
         path {
           step: 'c'
         }
         string_stats {
           common_stats {
             num_non_missing: 4
             min_num_values: 1
             max_num_values: 5
             avg_num_values: 2.5
             num_values_histogram {
               buckets {
                 low_value: 1.0
                 high_value: 1.0
                 sample_count: 1.3333333
               }
               buckets {
                 low_value: 1.0
                 high_value: 3.0
                 sample_count: 1.3333333
               }
               buckets {
                 low_value: 3.0
                 high_value: 5.0
                 sample_count: 1.3333333
               }
               type: QUANTILES
             }
             tot_num_values: 10
           }
           avg_length: 1.29999995232
         }
         """
     wanted_by_path = {
         types.FeaturePath(['c']):
         text_format.Parse(expected_stats_text,
                           statistics_pb2.FeatureNameStatistics())
     }

     schema_text = """
     feature {
       name: "c"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """
     categorical_schema = text_format.Parse(schema_text, schema_pb2.Schema())
     stats_generator = basic_stats_generator.BasicStatsGenerator(
         schema=categorical_schema,
         num_values_histogram_buckets=3,
         num_histogram_buckets=3,
         num_quantiles_histogram_buckets=4)
     self.assertCombinerOutputEqual(
         input_batches, stats_generator, wanted_by_path)
 def test_basic_stats_generator_feature_with_different_types(self):
     """Expects TypeError when feature 'a' changes type between batches.

     The first batch holds float values for 'a' and the second holds an
     int; the expected error text is 'Cannot determine the type'.
     """
     batches = [
         pa.Table.from_arrays([pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]])],
                              ['a']),
         pa.Table.from_arrays([pa.array([[1]])], ['a']),
     ]
     generator = basic_stats_generator.BasicStatsGenerator()
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; assertRaisesRegex is the supported spelling.
     with self.assertRaisesRegex(TypeError, 'Cannot determine the type'):
         self.assertCombinerOutputEqual(batches, generator, None)
 def testInvalidWeightColumnStringValues(self):
     """enumerate_arrays rejects a weight column holding strings."""
     table = pa.Table.from_arrays(
         [pa.array([[1], [2, 3]]), pa.array([["two"], ["two"]])],
         ["v", "w"])
     non_numeric_error = 'Weight feature "w" must be of numeric type.*'
     with self.assertRaisesRegex(ValueError, non_numeric_error):
         enumerated = arrow_util.enumerate_arrays(
             table, weight_column="w", enumerate_leaves_only=False)
         # Drain the iterable so the error surfaces even if it is lazy.
         for _ in enumerated:
             pass
 def testInvalidWeightColumn(self):
     """enumerate_arrays rejects a weight column with an empty example."""
     table = pa.Table.from_arrays(
         [pa.array([[1], [2, 3]]), pa.array([[1], []])], ["v", "w"])
     wrong_count_error = (
         "weight feature must have exactly one value in each example")
     with self.assertRaisesRegex(ValueError, wrong_count_error):
         enumerated = arrow_util.enumerate_arrays(
             table, weight_column="w", enumerate_leaves_only=False)
         # Drain the iterable so the error surfaces even if it is lazy.
         for _ in enumerated:
             pass
Beispiel #28
0
 def test_all_null_mask_one_null(self):
   """The mask is False for rows where only one of the two paths is null."""
   populated_column = pa.array([[1], [1]])
   null_column = pa.array([None, None], type=pa.null())
   table = pa.Table.from_arrays([populated_column, null_column],
                                ['f1', 'f2'])
   batch = input_batch.InputBatch(table)
   want_mask = np.array([False, False])
   got_mask = batch.all_null_mask(
       types.FeaturePath(['f1']), types.FeaturePath(['f2']))
   np.testing.assert_array_equal(got_mask, want_mask)
    def test_nl_generator_avg_word_heuristic_non_match(self):
        """Tests generator with avg word length heuristic."""
        first_batch = pa.array(
            [['abc' * 10, 'xxxxxxxxx'], ['xosuhddsofuhg123fdgosh']])
        second_batch = pa.array([['Only one valid text?']])
        nl_generator = nlsg.NLStatsGenerator(values_threshold=2)

        # Empty statistics are expected for this input.
        empty_stats = statistics_pb2.FeatureNameStatistics()
        self.assertCombinerOutputEqual([first_batch, second_batch],
                                       nl_generator, empty_stats)
 def test_count_missing_generator_required_path(self):
   """Expects a count of 0 when 'value' is passed as a required path."""
   # Both columns hold the same rows: a value, a null and an empty list.
   rows = [[1], None, []]
   table = pa.Table.from_arrays(
       [pa.array(rows), pa.array(rows)], ['index', 'value'])
   batch = input_batch.InputBatch(table)
   counted_path = types.FeaturePath(['index'])
   required = [types.FeaturePath(['value'])]
   generator = count_missing_generator.CountMissingGenerator(
       counted_path, required)
   acc = generator.add_input(generator.create_accumulator(), batch)
   self.assertEqual(0, generator.extract_output(acc))