Esempio n. 1
0
    def execute_aggregate(view: View):
        '''
        Aggregate data points on an axis for bar or line charts.

        The channel ("x" or "y") whose attribute carries a non-empty
        aggregation is treated as the measure, and the attribute on the other
        channel becomes the group-by key. If both channels carry an
        aggregation, the x channel is used as the measure. `view.data` is
        replaced with the aggregated frame, sorted by the group-by attribute.
        Group-by values listed in `view.data.unique_values` that are absent
        from the aggregated result are added back with a value of 0.

        If neither channel carries an aggregation, `view.data` is left
        untouched.

        Parameters
        ----------
        view: lux.View
            lux.View object that represents a visualization

        Returns
        -------
        None
        '''
        x_attr = view.get_attr_by_channel("x")[0]
        y_attr = view.get_attr_by_channel("y")[0]
        groupby_attr = None
        measure_attr = None
        if (y_attr.aggregation != ""):
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        # Checked second on purpose: an aggregation on x overrides one on y.
        if (x_attr.aggregation != ""):
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation
        if measure_attr is None:
            # Nothing to aggregate. Bail out before touching the group-by
            # attribute, which was never assigned in this case.
            return

        groupby_col = groupby_attr.attribute
        # Must be read before view.data is replaced by the aggregated frame.
        all_attr_vals = view.data.unique_values[groupby_col]

        if (measure_attr.attribute == "Record"):
            # Count rows per group: the former index becomes the column that
            # is counted and then renamed to "Record".
            counted = view.data.reset_index().groupby(
                groupby_col).count().reset_index()
            counted = counted.rename(columns={"index": "Record"})
            view.data = counted[[groupby_col, "Record"]]
        else:
            view.data = view.data.groupby(groupby_col).agg(
                agg_func).reset_index()

        if (len(view.data) != len(all_attr_vals)):
            # For filtered aggregation that have missing groupby-attribute
            # values, set these aggregated value as 0, since no datapoints
            present_vals = set(view.data[groupby_col])
            columns = list(view.data.columns)
            for val in all_attr_vals:
                if (val not in present_vals):
                    # Build the row per column so that it matches the frame
                    # regardless of how many aggregated columns it has.
                    view.data.loc[len(view.data)] = [
                        val if col == groupby_col else 0 for col in columns
                    ]
        assert len(view.data) == len(
            all_attr_vals
        ), f"Aggregated data missing values compared to original range of values of `{groupby_col}`."
        view.data = view.data.sort_values(by=groupby_col, ascending=True)
        # drop=True discards the old index directly instead of materialising
        # it as an "index" column that then has to be removed again.
        view.data = view.data.reset_index(drop=True)
Esempio n. 2
0
    def execute_aggregate(view: View, ldf: LuxDataFrame):
        '''
        Aggregate data points on an axis for bar or line charts using SQL.

        The channel ("x" or "y") whose attribute carries a non-empty
        aggregation is the measure; the attribute on the other channel is the
        group-by key. If both carry an aggregation, the x channel wins. A
        GROUP BY query is built against `ldf.table_name`, restricted by the
        clause returned from `SQLExecutor.execute_filter(view)`, run through
        `ldf.SQLconnection`, and the result is stored in `view.data`.

        Only the "mean", "sum" and "max" aggregations (and the "Record" count
        case) produce a query; any other aggregation leaves `view.data`
        unchanged.

        Parameters
        ----------
        view: View
            Visualization whose `data` is populated.
        ldf : LuxDataFrame
            Provides `table_name` and `SQLconnection` for the query.

        Returns
        -------
        None
        '''
        import pandas as pd

        x_spec = view.get_attr_by_channel("x")[0]
        y_spec = view.get_attr_by_channel("y")[0]
        group_spec = ""
        measure_spec = ""
        if (y_spec.aggregation != ""):
            group_spec, measure_spec = x_spec, y_spec
            aggregation = y_spec.aggregation
        # Checked second on purpose: an aggregation on x overrides one on y.
        if (x_spec.aggregation != ""):
            group_spec, measure_spec = y_spec, x_spec
            aggregation = x_spec.aggregation

        if (measure_spec != ""):
            group_col = group_spec.attribute
            measure_col = measure_spec.attribute
            # Only the WHERE clause is needed; the second element is unused.
            where_clause, _ = SQLExecutor.execute_filter(view)

            if (measure_col == "Record"):
                # Bar chart case: count the rows in each group.
                query = "SELECT {0}, COUNT({0}) FROM {1} {2} GROUP BY {0}".format(
                    group_col, ldf.table_name, where_clause)
                counts = pd.read_sql(query, ldf.SQLconnection)
                counts = counts.rename(columns={"count": "Record"})
                view.data = utils.pandas_to_lux(counts)
            else:
                # Map the aggregation name to its SQL function; compared with
                # == so any aggregation value is accepted without hashing.
                sql_functions = (("mean", "AVG"), ("sum", "SUM"), ("max", "MAX"))
                sql_func = next(
                    (func for name, func in sql_functions
                     if aggregation == name),
                    None)
                if sql_func is not None:
                    query = "SELECT {0}, {1}({2}) as {2} FROM {3} {4} GROUP BY {0}".format(
                        group_col, sql_func, measure_col, ldf.table_name,
                        where_clause)
                    aggregated = pd.read_sql(query, ldf.SQLconnection)
                    view.data = utils.pandas_to_lux(aggregated)
Esempio n. 3
0
def interestingness(view: View, ldf: LuxDataFrame) -> float:
    """
    Compute the interestingness score of the view.
    The interestingness metric is dependent on the view type, which is
    decided from the number of dimension, measure and filter specs in
    `view.spec_lst` (the "Record" attribute is not counted).

    Parameters
    ----------
    view : View
        View whose `data` has already been populated.
    ldf : LuxDataFrame
        Passed through to the scoring helpers and used for the cardinality
        lookup of the color attribute.

    Returns
    -------
    float
        Interestingness Score; -1 when no metric applies to the view.

    Raises
    ------
    Exception
        If `view.data` is None.
    """

    if view.data is None:
        raise Exception(
            "View.data needs to be populated before interestingness can be computed. Run Executor.execute(view,ldf)."
        )

    filter_specs = utils.get_filter_specs(view.spec_lst)
    view_attrs_specs = utils.get_attrs_specs(view.spec_lst)

    # "Record" is the synthetic count attribute and is excluded from the
    # dimension/measure tallies that decide the view type.
    attr_specs = [
        spec for spec in view_attrs_specs if spec.attribute != "Record"
    ]
    n_dim = sum(1 for spec in attr_specs if spec.data_model == 'dimension')
    n_msr = sum(1 for spec in attr_specs if spec.data_model == 'measure')
    n_filter = len(filter_specs)
    dimension_lst = view.get_attr_by_data_model("dimension")
    measure_lst = view.get_attr_by_data_model("measure")

    # Bar Chart
    if (n_dim == 1 and (n_msr == 0 or n_msr == 1)):
        if (n_filter == 0):
            return unevenness(view, ldf, measure_lst, dimension_lst)
        elif (n_filter == 1):
            return deviation_from_overall(view, ldf, filter_specs,
                                          measure_lst[0].attribute)
    # Histogram
    elif (n_dim == 0 and n_msr == 1):
        if (n_filter == 0):
            v = view.data["Count of Records"]
            return skewness(v)
        elif (n_filter == 1):
            return deviation_from_overall(view, ldf, filter_specs,
                                          "Count of Records")
    # Scatter Plot
    elif (n_dim == 0 and n_msr == 2):
        if (n_filter == 1):
            # Weight the score by the fraction of rows the filter keeps.
            v_filter_size = get_filtered_size(filter_specs, view.data)
            v_size = len(view.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(view, attr_specs)
    # Scatterplot colored by Dimension
    elif (n_dim == 1 and n_msr == 2):
        color_attr = view.get_attr_by_channel("color")[0].attribute

        C = ldf.cardinality[color_attr]
        if (C < 40):
            return 1 / C
        return -1
    # Scatterplot colored by measure
    elif (n_msr == 3):
        return 0.1

    # Default: reached by unrecognised view types and by bar chart /
    # histogram views with more than one filter, which previously fell
    # through and returned None.
    return -1