def execute_aggregate(view: View):
    """
    Aggregate data points on an axis for bar or line charts.

    Groups ``view.data`` by the non-aggregated axis attribute and applies the
    requested aggregation function to the other axis, replacing ``view.data``
    in place with the aggregated result.

    Parameters
    ----------
    view : lux.View
        lux.View object that represents a visualization; its ``data``
        attribute is mutated in place.

    Returns
    -------
    None
    """
    x_attr = view.get_attr_by_channel("x")[0]
    y_attr = view.get_attr_by_channel("y")[0]
    groupby_attr = ""
    measure_attr = ""
    # Whichever axis carries an aggregation becomes the measure; the other
    # axis becomes the groupby key.
    if y_attr.aggregation != "":
        groupby_attr = x_attr
        measure_attr = y_attr
        agg_func = y_attr.aggregation
    if x_attr.aggregation != "":
        groupby_attr = y_attr
        measure_attr = x_attr
        agg_func = x_attr.aggregation
    if measure_attr != "":
        # BUGFIX: look up the groupby attribute's unique values only once we
        # know an aggregation was requested. The original computed this before
        # the guard, so a view with no aggregation on either axis crashed with
        # AttributeError (groupby_attr is then the plain string "").
        all_attr_vals = view.data.unique_values[groupby_attr.attribute]
        if measure_attr.attribute == "Record":
            # Bar chart of record counts: count rows per group.
            view.data = view.data.reset_index()
            view.data = view.data.groupby(
                groupby_attr.attribute).count().reset_index()
            view.data = view.data.rename(columns={"index": "Record"})
            view.data = view.data[[groupby_attr.attribute, "Record"]]
        else:
            groupby_result = view.data.groupby(groupby_attr.attribute)
            view.data = groupby_result.agg(agg_func).reset_index()
        result_vals = list(view.data[groupby_attr.attribute])
        if len(result_vals) != len(all_attr_vals):
            # For filtered aggregations that have missing groupby-attribute
            # values, set these aggregated values to 0 since there are no
            # data points for them.
            for val in all_attr_vals:
                if val not in result_vals:
                    view.data.loc[len(view.data)] = [val, 0]
        assert len(list(view.data[groupby_attr.attribute])) == len(
            all_attr_vals
        ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
        view.data = view.data.sort_values(by=groupby_attr.attribute,
                                          ascending=True)
        # reset_index(drop=True) replaces the original's two-step
        # reset_index() + drop(columns="index") with the idiomatic one-liner.
        view.data = view.data.reset_index(drop=True)
def execute_aggregate(view: View, ldf: LuxDataFrame):
    """
    Aggregate data points on an axis for bar or line charts by pushing the
    GROUP BY down to the SQL backend.

    The aggregated result is read back with ``pandas.read_sql`` and stored on
    ``view.data`` (converted via ``utils.pandas_to_lux``).

    Parameters
    ----------
    view : lux.View
        Visualization whose ``data`` attribute is replaced with the
        aggregated query result.
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame carrying ``table_name`` and the ``SQLconnection`` to
        query against.

    Returns
    -------
    None
    """
    import pandas as pd
    x_attr = view.get_attr_by_channel("x")[0]
    y_attr = view.get_attr_by_channel("y")[0]
    groupby_attr = ""
    measure_attr = ""
    # Whichever axis carries an aggregation becomes the measure; the other
    # axis becomes the groupby key.
    if y_attr.aggregation != "":
        groupby_attr = x_attr
        measure_attr = y_attr
        agg_func = y_attr.aggregation
    if x_attr.aggregation != "":
        groupby_attr = y_attr
        measure_attr = x_attr
        agg_func = x_attr.aggregation
    if measure_attr != "":
        # SECURITY NOTE(review): queries are assembled with str.format, so
        # attribute/table names are interpolated unescaped. This is safe only
        # while those names come from trusted schema metadata — do not feed
        # user-controlled strings through this path.
        where_clause, filter_vars = SQLExecutor.execute_filter(view)
        if measure_attr.attribute == "Record":
            # Bar chart case: need a row count for each group.
            count_query = "SELECT {}, COUNT({}) FROM {} {} GROUP BY {}".format(
                groupby_attr.attribute, groupby_attr.attribute,
                ldf.table_name, where_clause, groupby_attr.attribute)
            view.data = pd.read_sql(count_query, ldf.SQLconnection)
            view.data = view.data.rename(columns={"count": "Record"})
            view.data = utils.pandas_to_lux(view.data)
        else:
            # Map Lux aggregation names onto their SQL functions. This
            # consolidates the original's three copy-pasted mean/sum/max
            # branches (which all reused the misleading name `mean_query`)
            # while producing byte-identical queries.
            sql_func = {"mean": "AVG", "sum": "SUM", "max": "MAX"}.get(agg_func)
            if sql_func is not None:
                agg_query = "SELECT {}, {}({}) as {} FROM {} {} GROUP BY {}".format(
                    groupby_attr.attribute, sql_func, measure_attr.attribute,
                    measure_attr.attribute, ldf.table_name, where_clause,
                    groupby_attr.attribute)
                view.data = pd.read_sql(agg_query, ldf.SQLconnection)
                view.data = utils.pandas_to_lux(view.data)
def interestingness(view: View, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the view.

    The interestingness metric is dependent on the view type: bar charts use
    unevenness or deviation-from-overall, histograms use skewness or
    deviation-from-overall, scatter plots use monotonicity (scaled by filter
    coverage), and colored scatter plots use color cardinality.

    Parameters
    ----------
    view : View
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score (-1 marks uninteresting/unsupported views).

    Raises
    ------
    Exception
        If ``view.data`` has not been populated by an executor yet.
    """
    if view.data is None:
        raise Exception(
            "View.data needs to be populated before interestingness can be computed. Run Executor.execute(view,ldf)."
        )

    # Tally dimensions and measures among the non-"Record" attribute specs.
    n_dim = 0
    n_msr = 0
    filter_specs = utils.get_filter_specs(view.spec_lst)
    view_attrs_specs = utils.get_attrs_specs(view.spec_lst)
    for spec in view_attrs_specs:
        if spec.attribute != "Record":
            if spec.data_model == 'dimension':
                n_dim += 1
            if spec.data_model == 'measure':
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [
        spec for spec in view_attrs_specs if spec.attribute != "Record"
    ]
    dimension_lst = view.get_attr_by_data_model("dimension")
    measure_lst = view.get_attr_by_data_model("measure")

    # Bar Chart
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if n_filter == 0:
            return unevenness(view, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            # NOTE(review): measure_lst[0] assumes at least one measure is
            # present; an n_msr == 0 view with one filter would raise
            # IndexError here — confirm upstream guarantees a measure.
            return deviation_from_overall(view, ldf, filter_specs,
                                          measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if n_filter == 0:
            v = view.data["Count of Records"]
            return skewness(v)
        elif n_filter == 1:
            return deviation_from_overall(view, ldf, filter_specs,
                                          "Count of Records")
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if n_filter == 1:
            # Scale significance by the fraction of data the filter retains.
            v_filter_size = get_filtered_size(filter_specs, view.data)
            v_size = len(view.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(view, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        # BUGFIX: the original had a second, identical
        # `elif (n_dim == 1 and n_msr == 2)` branch returning 0.2 after this
        # one; it was unreachable and has been removed (behavior unchanged).
        color_attr = view.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        else:
            return -1
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # Default
    else:
        return -1