def test_get_flattened_array_parent_indices(self):
  """Checks GetFlattenedArrayParentIndices on empty and non-empty inputs.

  Two list arrays are exercised: one with no rows at all, and one whose
  third row is an empty list.
  """
  # No rows in, so an empty int32 array of parent indices is expected.
  empty_lists = pa.array([], type=pa.list_(pa.int32()))
  expected_for_empty = pa.array([], type=pa.int32())
  actual = arrow_util.GetFlattenedArrayParentIndices(empty_lists)
  self.assertTrue(actual.equals(expected_for_empty))

  # Row 2 is an empty list, so index 2 does not show up in the expectation.
  float_lists = pa.array([[1.], [2.], [], [3.]])
  expected_for_floats = pa.array([0, 1, 3], type=pa.int32())
  actual = arrow_util.GetFlattenedArrayParentIndices(float_lists)
  self.assertTrue(actual.equals(expected_for_floats))
def test_list_lengths(self):
  """Checks ListLengthsFromListArray against three list arrays.

  Each case pairs an input list array with the int32 lengths array the
  function is expected to return for it.
  """
  cases = [
      # No rows at all.
      (pa.array([], type=pa.list_(pa.int64())),
       pa.array([], type=pa.int32())),
      # An empty list is expected to have length 0.
      (pa.array([[1., 2.], [], [3.]]),
       pa.array([2, 0, 1], type=pa.int32())),
      # A null list is also expected to be reported with length 0.
      (pa.array([[1., 2.], None, [3.]]),
       pa.array([2, 0, 1], type=pa.int32())),
  ]
  for list_array, expected_lengths in cases:
    actual_lengths = arrow_util.ListLengthsFromListArray(list_array)
    self.assertTrue(actual_lengths.equals(expected_lengths))
def test_get_array_null_bitmap_as_byte_array(self):
  """Checks GetArrayNullBitmapAsByteArray on several int arrays.

  The expected result in every case is a uint8 array with one entry per
  input element: 1 where the element is null and 0 otherwise. The cases
  cover an empty array, a mix of nulls and values, no nulls, and all nulls.
  """
  # Empty input.
  array = pa.array([], type=pa.int32())
  null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
  self.assertTrue(null_masks.equals(pa.array([], type=pa.uint8())))

  # Mixed nulls and values.
  array = pa.array([1, 2, None, 3, None], type=pa.int32())
  null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
  self.assertTrue(
      null_masks.equals(pa.array([0, 0, 1, 0, 1], type=pa.uint8())))

  # No nulls at all.
  array = pa.array([1, 2, 3])
  null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
  self.assertTrue(null_masks.equals(pa.array([0, 0, 0], type=pa.uint8())))

  # Only nulls.
  array = pa.array([None, None, None], type=pa.int32())
  null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
  self.assertTrue(null_masks.equals(pa.array([1, 1, 1], type=pa.uint8())))

  # Demonstrate that the returned array can be converted to a numpy boolean
  # array w/o copying.
  # np.bool_ is used because it exists in every NumPy release, whereas the
  # `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
  np.testing.assert_equal(
      np.array([True, True, True]), null_masks.to_numpy().view(np.bool_))
{ "int64_feature": np.array([4], dtype=np.int64), "int32_feature": np.array([4], dtype=np.int32), "float_feature": np.array([2., 3., 4.], dtype=np.float32), "double_feature": np.array([2., 3., 4.], dtype=np.float64), "bytes_feature": np.array([b"ghi"], dtype=np.object), "unicode_feature": np.array([u"ghi"], dtype=np.object), }, ], expected_output={ "int64_feature": pa.array([[1, 2, 3], [4]], type=pa.list_(pa.int64())), "uint64_feature": pa.array([[1, 2, 3], None], type=pa.list_(pa.uint64())), "int32_feature": pa.array([[1, 2, 3], [4]], type=pa.list_(pa.int32())), "uint32_feature": pa.array([[1, 2, 3], None], type=pa.list_(pa.uint32())), "float_feature": pa.array([[1.], [2., 3., 4.]], type=pa.list_(pa.float32())), "double_feature": pa.array([[1.], [2., 3., 4.]], type=pa.list_(pa.float64())), "bytes_feature": pa.array([[b"abc", b"def"], [b"ghi"]], type=pa.list_(pa.binary())), "unicode_feature": pa.array([[b"abc", b"def"], [b"ghi"]], type=pa.list_(pa.string())), }), dict(testcase_name="mixed_unicode_and_bytes", input_examples=[
def test_stats_pipeline_with_examples_with_no_values(self):
  """Runs GenerateStatistics over examples whose features hold no values.

  The input is three identical single-row tables. Features 'a' (float32),
  'b' (binary) and 'c' (int32) each contain one empty list, and 'w' holds
  the value 2 and is configured as the weight feature. The pipeline output
  is compared against a hand-written DatasetFeatureStatisticsList.
  """
  # Three separate but identical tables, one example each.
  tables = [
      pa.Table.from_arrays([
          pa.array([[]], type=pa.list_(pa.float32())),
          pa.array([[]], type=pa.list_(pa.binary())),
          pa.array([[]], type=pa.list_(pa.int32())),
          pa.array([[2]]),
      ], ['a', 'b', 'c', 'w'])
      for _ in range(3)
  ]

  expected_result = text_format.Parse(
      """
      datasets{
        num_examples: 3
        features {
          path { step: 'a' }
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets { sample_count: 1.5 }
                buckets { sample_count: 1.5 }
                type: QUANTILES
              }
              weighted_common_stats { num_non_missing: 6 }
            }
          }
        }
        features {
          path { step: 'b' }
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets { sample_count: 1.5 }
                buckets { sample_count: 1.5 }
                type: QUANTILES
              }
              weighted_common_stats { num_non_missing: 6 }
            }
          }
        }
        features {
          path { step: 'c' }
          type: INT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets { sample_count: 1.5 }
                buckets { sample_count: 1.5 }
                type: QUANTILES
              }
              weighted_common_stats { num_non_missing: 6 }
            }
          }
        }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())

  # Small bucket counts keep the expected proto above short.
  options = stats_options.StatsOptions(
      weight_feature='w',
      num_top_values=1,
      num_rank_histogram_buckets=1,
      num_values_histogram_buckets=2,
      num_histogram_buckets=1,
      num_quantiles_histogram_buckets=1,
      epsilon=0.001)

  with beam.Pipeline() as p:
    examples = p | beam.Create(tables)
    result = examples | stats_api.GenerateStatistics(options)
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result))