def _generate_partial_statistics_from_df(
    dataframe: DataFrame,
    stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
  """Generate accumulators containing partial stats.

  Columns whose numpy kind has no Arrow equivalent, and columns excluded by
  `stats_options.feature_whitelist` (when it is set), are dropped before
  stats generation. Bool columns are tracked as categorical INT features via
  a BoolDomain in a synthesized schema.
  """
  whitelist = set(stats_options.feature_whitelist or ())

  # Work on a shallow copy so the caller's options object is untouched; the
  # whitelist has been applied here, so downstream no longer needs it.
  effective_options = copy.copy(stats_options)
  effective_options.feature_whitelist = None

  inferred_schema = schema_pb2.Schema()
  columns_to_drop = []
  for name, dtype in zip(dataframe.columns, dataframe.dtypes):
    unsupported = not table_util.NumpyKindToArrowType(dtype.kind)
    filtered_out = bool(whitelist) and name not in whitelist
    if unsupported or filtered_out:
      columns_to_drop.append(name)
    elif dtype.kind == 'b':
      # Track bool type feature as categorical.
      inferred_schema.feature.add(
          name=name,
          type=schema_pb2.INT,
          bool_domain=schema_pb2.BoolDomain())

  dataframe = dataframe.drop(columns=columns_to_drop)
  if inferred_schema.feature:
    effective_options.schema = inferred_schema

  primitive_batch = table_util.DataFrameToRecordBatch(dataframe)
  list_batch = table_util.CanonicalizeRecordBatch(primitive_batch)
  return stats_impl.generate_partial_statistics_in_memory(
      list_batch, effective_options, stats_generators)
def _generate_partial_statistics_from_df(
    dataframe: pd.DataFrame,
    stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
  """Generate accumulators containing partial stats.

  Args:
    dataframe: Input pandas DataFrame; each row is decoded into one example
      dict of singleton numpy arrays (None for missing float/string values).
    stats_options: Stats options. `feature_whitelist` (if set) restricts
      which columns are decoded; the whitelist is stripped from the copy of
      the options that is passed downstream.
    stats_generators: Combiner stats generators to run over the decoded
      examples.

  Returns:
    A list of accumulators, one per generator, holding partial statistics.
  """
  inmemory_dicts = [{} for _ in range(len(dataframe))]
  isnull = pd.isnull
  # Decoding fns keyed by numpy dtype kind. Use the builtin `object` (the
  # `np.object` alias was removed in NumPy 1.24) and concrete default dtypes
  # (abstract dtypes like `np.integer`/`np.floating` are rejected as `dtype=`
  # arguments by modern NumPy).
  int_fn = lambda x: np.array([x], dtype=np.int_)
  float_fn = lambda x: None if isnull(x) else np.array([x], dtype=np.float64)
  str_fn = lambda x: None if isnull(x) else np.array([x], dtype=object)
  decode_fn = {
      # int type.
      'i': int_fn,
      'u': int_fn,
      # float type.
      'f': float_fn,
      # bool type.
      'b': int_fn,
      # string type.
      'S': str_fn,
      'O': str_fn,
      'U': str_fn,
  }

  feature_whitelist = set(stats_options.feature_whitelist or ())
  # Create a copy of the stats options so that we don't modify the input
  # object. Remove feature_whitelist as it has been applied here.
  stats_options_modified = copy.copy(stats_options)
  stats_options_modified.feature_whitelist = None

  schema = schema_pb2.Schema()
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    kind = col_type.kind
    if (kind not in decode_fn or
        (feature_whitelist and col_name not in feature_whitelist)):
      logging.warning('Ignoring feature %s of type %s', col_name, col_type)
      continue
    if kind == 'b':
      # Track bool type feature as categorical.
      schema.feature.add(name=col_name, type=schema_pb2.INT,
                         bool_domain=schema_pb2.BoolDomain())
    fn = decode_fn[kind]
    # enumerate replaces the original hand-rolled index counter.
    for row_index, val in enumerate(dataframe[col_name]):
      inmemory_dicts[row_index][col_name] = fn(val)

  if schema.feature:
    stats_options_modified.schema = schema
  record_batch = decoded_examples_to_arrow.DecodedExamplesToRecordBatch(
      inmemory_dicts)
  return stats_impl.generate_partial_statistics_in_memory(
      record_batch, stats_options_modified, stats_generators)
def _generate_partial_statistics_from_df(
    dataframe: pd.DataFrame,
    stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
  """Generate accumulators containing partial stats.

  Args:
    dataframe: Input pandas DataFrame; each row is decoded into one example
      dict of singleton numpy arrays (None for missing float/string values).
    stats_options: Stats options. Not mutated; if bool columns are present,
      a copy carrying the synthesized schema is passed downstream instead.
    stats_generators: Combiner stats generators to run over the decoded
      examples.

  Returns:
    A list of accumulators, one per generator, holding partial statistics.
  """
  inmemory_dicts = [{} for _ in range(len(dataframe))]
  isnull = pd.isnull
  # Decoding fns keyed by numpy dtype kind. Use the builtin `object` (the
  # `np.object` alias was removed in NumPy 1.24) and concrete default dtypes
  # (abstract dtypes like `np.integer`/`np.floating` are rejected as `dtype=`
  # arguments by modern NumPy).
  int_fn = lambda x: np.array([x], dtype=np.int_)
  float_fn = lambda x: None if isnull(x) else np.array([x], dtype=np.float64)
  str_fn = lambda x: None if isnull(x) else np.array([x], dtype=object)
  decode_fn = {
      # int type.
      'i': int_fn,
      'u': int_fn,
      # float type.
      'f': float_fn,
      # bool type.
      'b': int_fn,
      # string type.
      'S': str_fn,
      'O': str_fn,
      'U': str_fn,
  }

  schema = schema_pb2.Schema()
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    kind = col_type.kind
    if kind not in decode_fn:
      logging.warning('Ignoring feature %s of type %s', col_name, col_type)
      continue
    if kind == 'b':
      # Track bool type feature as categorical.
      schema.feature.add(name=col_name, type=schema_pb2.INT,
                         bool_domain=schema_pb2.BoolDomain())
    fn = decode_fn[kind]
    # enumerate replaces the original hand-rolled index counter.
    for row_index, val in enumerate(dataframe[col_name]):
      inmemory_dicts[row_index][col_name] = fn(val)

  # Copy the options before attaching the schema so the caller's object is
  # not mutated (the original assigned to stats_options.schema in place,
  # unlike the sibling implementations which explicitly copy).
  stats_options_modified = copy.copy(stats_options)
  if schema.feature:
    stats_options_modified.schema = schema
  return stats_impl.generate_partial_statistics_in_memory(
      decoded_examples_to_arrow.DecodedExamplesToTable(inmemory_dicts),
      stats_options_modified, stats_generators)
def _generate_partial_statistics_from_df(
    dataframe: pd.DataFrame,
    stats_options: options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
  """Generate accumulators containing partial stats.

  Columns with no Arrow mapping for their numpy kind, and columns outside
  `stats_options.feature_whitelist` (when it is set), are skipped with a
  warning. Bool columns are recorded as categorical INT features with a
  BoolDomain in a synthesized schema.
  """
  whitelist = set(stats_options.feature_whitelist or ())

  # Shallow-copy the options so the caller's object is left untouched; the
  # whitelist has been consumed here and is cleared on the copy.
  effective_options = copy.copy(stats_options)
  effective_options.feature_whitelist = None

  inferred_schema = schema_pb2.Schema()
  selected_fields = []
  for name, dtype in zip(dataframe.columns, dataframe.dtypes):
    kind = dtype.kind
    supported = kind in _NUMPY_KIND_TO_ARROW_TYPE
    allowed = not whitelist or name in whitelist
    if not (supported and allowed):
      logging.warning('Ignoring feature %s of type %s', name, dtype)
      continue
    if kind == 'b':
      # Track bool type feature as categorical.
      inferred_schema.feature.add(
          name=name,
          type=schema_pb2.INT,
          bool_domain=schema_pb2.BoolDomain())
    selected_fields.append(pa.field(name, _NUMPY_KIND_TO_ARROW_TYPE[kind]))

  if inferred_schema.feature:
    effective_options.schema = inferred_schema

  primitive_batch = pa.RecordBatch.from_pandas(
      dataframe, schema=pa.schema(selected_fields))
  # TODO(pachristopher): Consider using a list of record batches instead of a
  # single record batch to avoid having list arrays larger than 2^31 elements.
  list_arrays = [
      array_util.ToSingletonListArray(col) for col in primitive_batch.columns
  ]
  list_batch = pa.RecordBatch.from_arrays(
      list_arrays, primitive_batch.schema.names)
  return stats_impl.generate_partial_statistics_in_memory(
      list_batch, effective_options, stats_generators)
'feature_name': 'x', 'domain': schema_pb2.StringDomain(value=['a', 'b']), 'output_schema_proto_text': ''' feature { name: 'x' string_domain { value: 'a' value: 'b' } }''' }, { 'testcase_name': 'bool_domain', 'input_schema_proto_text': '''feature { name: 'x' }''', 'feature_name': 'x', 'domain': schema_pb2.BoolDomain(true_value='T', false_value='F'), 'output_schema_proto_text': ''' feature { name: 'x' bool_domain { true_value: 'T' false_value: 'F' } } ''' }, { 'testcase_name': 'global_domain', 'input_schema_proto_text': ''' string_domain { name: 'global_domain' value: 'a' value: 'b' } feature { name: 'x' }''', 'feature_name': 'x', 'domain': 'global_domain',