def _generate_partial_statistics_from_df(
    dataframe: DataFrame, stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
  """Generate accumulators containing partial stats."""
  feature_whitelist = set()
  if stats_options.feature_whitelist:
    feature_whitelist.update(stats_options.feature_whitelist)
  # Create a copy of the stats options so that we don't modify the input object.
  stats_options_modified = copy.copy(stats_options)
  # Remove feature_whitelist option as it is no longer needed.
  stats_options_modified.feature_whitelist = None
  schema = schema_pb2.Schema()
  drop_columns = []
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    if (not table_util.NumpyKindToArrowType(col_type.kind) or
        (feature_whitelist and col_name not in feature_whitelist)):
      drop_columns.append(col_name)
    elif col_type.kind == 'b':
      # Track bool type feature as categorical.
      schema.feature.add(
          name=col_name,
          type=schema_pb2.INT,
          bool_domain=schema_pb2.BoolDomain())
  dataframe = dataframe.drop(columns=drop_columns)
  if schema.feature:
    stats_options_modified.schema = schema
  record_batch_with_primitive_arrays = table_util.DataFrameToRecordBatch(
      dataframe)
  record_batch_with_list_arrays = table_util.CanonicalizeRecordBatch(
      record_batch_with_primitive_arrays)
  return stats_impl.generate_partial_statistics_in_memory(
      record_batch_with_list_arrays, stats_options_modified, stats_generators)
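
# A minimal usage sketch (not part of the original module). It assumes the TFDV
# modules already referenced above (`options`, `stats_impl`) are imported in
# this file, and that a `stats_impl.get_generators(options, in_memory=True)`
# helper is available to build the in-memory combiner generators; both of those
# assumptions are for illustration only, not confirmed API.
if __name__ == '__main__':
  import pandas as pd

  demo_df = pd.DataFrame({'age': [17, 30], 'label': [False, True]})
  demo_options = options.StatsOptions()
  # Assumed helper; substitute however your TFDV version exposes its generators.
  demo_generators = stats_impl.get_generators(demo_options, in_memory=True)
  demo_accumulators = _generate_partial_statistics_from_df(
      demo_df, demo_options, demo_generators)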
def testDataFrameToRecordBatch(self):
  df_data = pd.DataFrame([{
      "age": 17,
      "language": "english",
      "prediction": False,
      "label": False,
      "complex_var": 2 + 3j
  }, {
      "age": 30,
      "language": "spanish",
      "prediction": True,
      "label": True,
      "complex_var": 2 + 3j
  }])
  expected_fields = {"age", "language", "prediction", "label"}
  expected_row_counts = collections.Counter({
      (17, 30): 1,
      (0, 1): 2,
      (b"english", b"spanish"): 1
  })

  rb_data = table_util.DataFrameToRecordBatch(df_data)
  self.assertSetEqual(set(rb_data.schema.names), expected_fields)
  actual_row_counts = collections.Counter()
  for col in rb_data.columns:
    row = tuple(col.to_pylist())
    actual_row_counts[row] += 1
  self.assertDictEqual(actual_row_counts, expected_row_counts)

  canonicalized_rb_data = table_util.CanonicalizeRecordBatch(rb_data)
  self.assertSetEqual(
      set(canonicalized_rb_data.schema.names), expected_fields)
  actual_row_counts = collections.Counter()
  for col in canonicalized_rb_data.columns:
    col = col.to_pylist()
    row = (col[0][0], col[1][0])
    actual_row_counts[row] += 1
  self.assertDictEqual(actual_row_counts, expected_row_counts)

  expected_age_column = pa.array([[17], [30]], type=pa.list_(pa.int64()))
  expected_language_column = pa.array([["english"], ["spanish"]],
                                      type=pa.list_(pa.binary()))
  expected_prediction_column = pa.array([[0], [1]], type=pa.list_(pa.int8()))
  expected_label_column = pa.array([[0], [1]], type=pa.list_(pa.int8()))
  self.assertTrue(
      canonicalized_rb_data.column(
          canonicalized_rb_data.schema.get_field_index("age")).equals(
              expected_age_column))
  self.assertTrue(
      canonicalized_rb_data.column(
          canonicalized_rb_data.schema.get_field_index("language")).equals(
              expected_language_column))
  self.assertTrue(
      canonicalized_rb_data.column(
          canonicalized_rb_data.schema.get_field_index("prediction")).equals(
              expected_prediction_column))
  self.assertTrue(
      canonicalized_rb_data.column(
          canonicalized_rb_data.schema.get_field_index("label")).equals(
              expected_label_column))
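
# Illustration (not part of the original test): the canonical shape the
# assertions above expect. DataFrameToRecordBatch yields primitive arrays
# (e.g. int64 [17, 30]), and CanonicalizeRecordBatch wraps each value in a
# length-one list (list<int64> [[17], [30]]). The pyarrow construction below
# is only a sketch of that wrapping, not the library's implementation.
def _list_wrapping_sketch():
  import numpy as np
  import pyarrow as pa

  primitive = pa.array([17, 30], type=pa.int64())
  # Offsets [0, 1, 2] place each value in its own single-element list.
  offsets = pa.array(np.arange(len(primitive) + 1), type=pa.int32())
  as_lists = pa.ListArray.from_arrays(offsets, primitive)
  assert as_lists.equals(pa.array([[17], [30]], type=pa.list_(pa.int64())))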