def compute_stats(self, ldf: LuxDataFrame):
    """Precompute per-column statistics and cache them on ``ldf``.

    Three dictionaries stored as attributes of ``ldf`` are reset and refilled:

    * ``ldf.unique_values`` -- list of distinct values for every column whose
      dtype is not ``float64``.
    * ``ldf.cardinality`` -- number of distinct values for those columns;
      ``float64`` columns receive the sentinel value ``999`` instead.
    * ``ldf._min_max`` -- ``(min, max)`` tuple for ``float64`` and ``int64``
      columns.

    A column labelled with a pandas ``Timestamp`` is keyed in all three
    dictionaries by the label's date string (e.g. ``'2020-04-05'``); every
    other column is keyed by its label unchanged.

    When the index dtype is not ``int64``, the index is recorded as well,
    keyed by ``ldf.index.name``.

    Args:
        ldf: Dataframe to analyse. It is mutated in place.

    Returns:
        None. ``self`` is not used by this method.
    """
    ldf.unique_values = {}
    ldf._min_max = {}
    ldf.cardinality = {}

    for attribute in ldf.columns:
        if isinstance(attribute, pd._libs.tslibs.timestamps.Timestamp):
            # Timestamp labels are keyed by their date string
            # (e.g. Timestamp('2020-04-05 00:00') --> '2020-04-05').
            attribute_repr = str(attribute._date_repr)
        else:
            attribute_repr = attribute

        dtype = ldf.dtypes[attribute]

        if dtype != "float64":
            ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
            # Look the list up under the key it was just stored with; using
            # the raw label here raised KeyError for Timestamp columns.
            ldf.cardinality[attribute_repr] = len(
                ldf.unique_values[attribute_repr]
            )
        else:
            # Sentinel cardinality for float64 columns, whose distinct values
            # are not enumerated.
            ldf.cardinality[attribute_repr] = 999

        if dtype == "float64" or dtype == "int64":
            ldf._min_max[attribute_repr] = (
                ldf[attribute].min(),
                ldf[attribute].max(),
            )

    if ldf.index.dtype != "int64":
        # NOTE(review): an unnamed index has name None, so the entries below
        # are stored under the key None -- confirm callers expect that.
        index_column_name = ldf.index.name
        # Every index label is recorded as-is (duplicates are not collapsed).
        ldf.unique_values[index_column_name] = list(ldf.index)
        ldf.cardinality[index_column_name] = len(ldf.index)
def compute_stats(self, ldf: LuxDataFrame):
    """Precompute per-column statistics and cache them on ``ldf``.

    Three dictionaries stored as attributes of ``ldf`` are reset and refilled:

    * ``ldf.unique_values`` -- list of distinct values for every column.
    * ``ldf.cardinality`` -- number of distinct values for every column.
    * ``ldf._min_max`` -- ``(min, max)`` tuple for columns with a float or
      integer dtype.

    A column labelled with a pandas ``Timestamp`` is keyed in all three
    dictionaries by the label's date string (e.g. ``'2020-04-05'``); every
    other column is keyed by its label unchanged.

    When the index does not have an integer dtype, the index is recorded as
    well, keyed by ``ldf.index.name``.

    Args:
        ldf: Dataframe to analyse. It is mutated in place.

    Returns:
        None. ``self`` is not used by this method.
    """
    ldf.unique_values = {}
    ldf._min_max = {}
    ldf.cardinality = {}

    for attribute in ldf.columns:
        if isinstance(attribute, pd._libs.tslibs.timestamps.Timestamp):
            # Timestamp labels are keyed by their date string
            # (e.g. Timestamp('2020-04-05 00:00') --> '2020-04-05').
            attribute_repr = str(attribute._date_repr)
        else:
            attribute_repr = attribute

        # Read the column through its real label; the date string is only a
        # dictionary key and is not guaranteed to resolve as a column label.
        column = ldf[attribute]
        ldf.unique_values[attribute_repr] = list(column.unique())
        ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr])

        dtype = ldf.dtypes[attribute]
        if pd.api.types.is_float_dtype(dtype) or pd.api.types.is_integer_dtype(
            dtype
        ):
            ldf._min_max[attribute_repr] = (column.min(), column.max())

    if not pd.api.types.is_integer_dtype(ldf.index):
        # NOTE(review): an unnamed index has name None, so the entries below
        # are stored under the key None -- confirm callers expect that.
        index_column_name = ldf.index.name
        # Every index label is recorded as-is (duplicates are not collapsed).
        ldf.unique_values[index_column_name] = list(ldf.index)
        ldf.cardinality[index_column_name] = len(ldf.index)