Beispiel #1
0
    def compute_stats(self, ldf: LuxDataFrame):
        """Precompute per-column statistics and store them on ``ldf``.

        Three dictionaries on ``ldf`` are reset and refilled:

        * ``unique_values``: list of distinct values for every column whose
          dtype is not ``float64``.
        * ``cardinality``: number of distinct values for those columns;
          ``float64`` columns receive the placeholder value 999 instead.
        * ``_min_max``: ``(min, max)`` tuple for ``float64`` and ``int64``
          columns.

        Column labels that are pandas ``Timestamp`` objects are keyed by
        their date string (e.g. ``'2020-04-05'``); every other label is used
        as-is. When the index dtype is not ``int64`` the index values are
        recorded too, keyed by ``ldf.index.name``.
        """
        ldf.unique_values = {}
        ldf._min_max = {}
        ldf.cardinality = {}

        for attribute in ldf.columns:
            if isinstance(attribute, pd._libs.tslibs.timestamps.Timestamp):
                # Timestamp labels are stored under their date string,
                # e.g. Timestamp('2020-04-05 00:00:00') --> '2020-04-05'.
                attribute_repr = str(attribute._date_repr)
            else:
                attribute_repr = attribute

            dtype = ldf.dtypes[attribute]
            column = ldf[attribute]

            if dtype != "float64":
                distinct = list(column.unique())
                ldf.unique_values[attribute_repr] = distinct
                # Count the list that was just stored under attribute_repr;
                # looking it up by the raw label fails for Timestamp labels.
                ldf.cardinality[attribute_repr] = len(distinct)
            else:
                # Placeholder cardinality for float64 columns, whose distinct
                # values are not enumerated.
                ldf.cardinality[attribute_repr] = 999

            if dtype == "float64" or dtype == "int64":
                ldf._min_max[attribute_repr] = (column.min(), column.max())

        if ldf.index.dtype != 'int64':
            # NOTE: an unnamed index is stored under the key None.
            index_column_name = ldf.index.name
            ldf.unique_values[index_column_name] = list(ldf.index)
            ldf.cardinality[index_column_name] = len(ldf.index)
Beispiel #2
0
    def compute_stats(self, ldf: LuxDataFrame):
        """Precompute per-column statistics and store them on ``ldf``.

        Three dictionaries on ``ldf`` are reset and refilled:

        * ``unique_values``: list of distinct values for every column.
        * ``cardinality``: number of distinct values for every column.
        * ``_min_max``: ``(min, max)`` tuple for columns with a float or
          integer dtype.

        Column labels that are pandas ``Timestamp`` objects are keyed by
        their date string (e.g. ``'2020-04-05'``); every other label is used
        as-is. When the index does not have an integer dtype the index
        values are recorded too, keyed by ``ldf.index.name``.
        """
        ldf.unique_values = {}
        ldf._min_max = {}
        ldf.cardinality = {}

        for attribute in ldf.columns:
            if isinstance(attribute, pd._libs.tslibs.timestamps.Timestamp):
                # Timestamp labels are stored under their date string,
                # e.g. Timestamp('2020-04-05 00:00:00') --> '2020-04-05'.
                attribute_repr = str(attribute._date_repr)
            else:
                attribute_repr = attribute

            # Select the column by its real label: the date string is only
            # the dictionary key and is not guaranteed to be a valid column
            # label (e.g. when the columns form an object Index).
            column = ldf[attribute]
            distinct = list(column.unique())
            ldf.unique_values[attribute_repr] = distinct
            ldf.cardinality[attribute_repr] = len(distinct)

            dtype = ldf.dtypes[attribute]
            if pd.api.types.is_float_dtype(
                    dtype) or pd.api.types.is_integer_dtype(dtype):
                ldf._min_max[attribute_repr] = (
                    column.min(),
                    column.max(),
                )

        if not pd.api.types.is_integer_dtype(ldf.index):
            # NOTE: an unnamed index is stored under the key None.
            index_column_name = ldf.index.name
            ldf.unique_values[index_column_name] = list(ldf.index)
            ldf.cardinality[index_column_name] = len(ldf.index)