def test_cross_feature_stats_generator_multivalent_feature(self):
    """Checks the 'a' x 'b' cross stats when one row of 'a' has two values.

    Six rows are supplied over two batches while the expected proto reports
    count: 5 -- presumably the row where 'a' is [1.0, 2.0] is left out of
    the cross (confirm against the generator implementation).
    """
    stats_gen = cross_feature_stats_generator.CrossFeatureStatsGenerator(
        sample_rate=1.0)
    columns = ['a', 'b']

    first_batch = pa.RecordBatch.from_arrays(
        [
            pa.array([[1.0], [3.0], [5.0]]),
            pa.array([[2.0], [4.0], [6.0]]),
        ],
        columns)
    # The last row of 'a' carries two values instead of one.
    second_batch = pa.RecordBatch.from_arrays(
        [
            pa.array([[6.0], [10.0], [1.0, 2.0]]),
            pa.array([[14.0], [16.0], [3.9]]),
        ],
        columns)

    expected_a_b = text_format.Parse(
        """ path_x { step: "a" } path_y { step: "b" } count: 5 num_cross_stats { correlation: 0.923145 covariance: 15.6 } """,
        statistics_pb2.CrossFeatureStatistics())

    self.assertCombinerOutputEqual(
        [first_batch, second_batch],
        stats_gen,
        {},
        {('a', 'b'): expected_a_b})
def test_cross_feature_stats_generator(self):
    """Checks pairwise cross stats for three numeric features.

    Three batches are combined: two with single-valued float rows and one
    whose columns are all null-typed. Every expected cross ('a','b'),
    ('a','c') and ('b','c') reports count: 5.
    """
    stats_gen = cross_feature_stats_generator.CrossFeatureStatsGenerator(
        sample_rate=1.0)
    columns = ['a', 'b', 'c']

    dense_batch = pa.RecordBatch.from_arrays(
        [
            pa.array([[1.0], [3.0], [5.0]]),
            pa.array([[2.0], [4.0], [6.0]]),
            pa.array([[5.0], [3.0], [7.0]]),
        ],
        columns)
    second_dense_batch = pa.RecordBatch.from_arrays(
        [
            pa.array([[6.0], [10.0]]),
            pa.array([[14.0], [16.0]]),
            pa.array([[-1.0], [0]]),
        ],
        columns)
    # Every column in this batch is a two-row array of nulls.
    all_null_batch = pa.RecordBatch.from_arrays(
        [pa.array([None, None], type=pa.null()) for _ in columns],
        columns)

    expected_a_b = text_format.Parse(
        """ path_x { step: "a" } path_y { step: "b" } count: 5 num_cross_stats { correlation: 0.923145 covariance: 15.6 } """,
        statistics_pb2.CrossFeatureStatistics())
    expected_a_c = text_format.Parse(
        """ path_x { step: "a" } path_y { step: "c" } count: 5 num_cross_stats { correlation: -0.59476602 covariance: -5.4000001 } """,
        statistics_pb2.CrossFeatureStatistics())
    expected_b_c = text_format.Parse(
        """ path_x { step: "b" } path_y { step: "c" } count: 5 num_cross_stats { correlation: -0.81070298 covariance: -13.52 } """,
        statistics_pb2.CrossFeatureStatistics())
    expected_by_cross = {
        ('a', 'b'): expected_a_b,
        ('a', 'c'): expected_a_c,
        ('b', 'c'): expected_b_c,
    }

    self.assertCombinerOutputEqual(
        [dense_batch, second_dense_batch, all_null_batch],
        stats_gen,
        {},
        expected_by_cross)
def test_cross_feature_stats_generator_string_feature(self):
    """Checks that a string feature paired with a numeric one yields nothing.

    Feature 'a' holds strings and 'b' holds floats in both batches; the
    expected combiner output is an empty dict.
    """
    stats_gen = cross_feature_stats_generator.CrossFeatureStatsGenerator(
        sample_rate=1.0)
    columns = ['a', 'b']

    first_batch = pa.RecordBatch.from_arrays(
        [
            pa.array([['x'], ['y']]),
            pa.array([[2.0], [4.0]]),
        ],
        columns)
    second_batch = pa.RecordBatch.from_arrays(
        [
            pa.array([['a'], ['b']]),
            pa.array([[14.0], [16.0]]),
        ],
        columns)

    self.assertCombinerOutputEqual(
        [first_batch, second_batch], stats_gen, {}, {})
def test_cross_feature_stats_generator_single_feature(self):
    """Checks that a batch with only one feature produces an empty result."""
    stats_gen = cross_feature_stats_generator.CrossFeatureStatsGenerator(
        sample_rate=1.0)
    # Only feature 'a' is present, so there is no second feature to cross with.
    only_batch = pa.RecordBatch.from_arrays(
        [pa.array([[1.0], [3.0]])],
        ['a'])

    self.assertCombinerOutputEqual([only_batch], stats_gen, {}, {})