def test_vis_custom_aggregation_as_str():
    """A string aggregation passed via lux.Clause is stored verbatim on the measure clause."""
    df = pd.read_csv("lux/data/college.csv")
    # FIX: removed unused `import numpy as np` (numpy is not referenced in this test)
    vis = Vis(["HighestDegree", lux.Clause("AverageCost", aggregation="max")], df)
    assert vis.get_attr_by_data_model("measure")[0].aggregation == "max"
    assert vis.get_attr_by_data_model("measure")[0]._aggregation_name == "max"
def test_vis_custom_aggregation_as_numpy_func():
    """An aggregation given as a numpy callable is stored as-is and named after the function."""
    df = pd.read_csv("lux/data/college.csv")
    from lux.vis.Vis import Vis
    import numpy as np

    intent = ["HighestDegree", lux.Clause("AverageCost", aggregation=np.ptp)]
    vis = Vis(intent, df)
    measure_clause = vis.get_attr_by_data_model("measure")[0]
    assert measure_clause.aggregation == np.ptp
    assert measure_clause._aggregation_name == "ptp"
def test_vis_custom_aggregation_as_str(global_var):
    """A string aggregation passed via lux.Clause is stored verbatim on the measure clause."""
    df = pytest.college_df
    # FIX: removed unused `import numpy as np` (numpy is not referenced in this test)
    vis = Vis(["HighestDegree", lux.Clause("AverageCost", aggregation="max")], df)
    assert vis.get_attr_by_data_model("measure")[0].aggregation == "max"
    assert vis.get_attr_by_data_model("measure")[0]._aggregation_name == "max"
def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> int:
    """
    Difference in bar chart/histogram shape from overall chart.

    Note: this function assumes that the filtered vis.data is operating on the
    same range as the unfiltered vis.data.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame
    filter_specs : list
        List of filters from the Vis
    msr_attribute : str
        The attribute name of the measure value of the chart

    Returns
    -------
    int
        Score describing how different the vis is from the overall vis
    """
    v_filter_size = get_filtered_size(filter_specs, ldf)
    v_size = len(vis.data)
    v_filter = vis.data[msr_attribute]
    total = v_filter.sum()
    # FIX: the zero-total guard must run BEFORE the division; previously
    # `v_filter / total` was computed first, producing NaN/inf values.
    if total == 0:
        return 0
    v_filter = v_filter / total  # normalize by total to get ratio
    # Generate an "Overall" Vis (TODO: This is computed multiple times for every vis,
    # alternative is to directly access df.current_vis but we do not have guaruntee
    # that will always be unfiltered vis (in the non-Filter action scenario))
    import copy

    unfiltered_vis = copy.copy(vis)
    # Remove filters, keep only attribute intent
    unfiltered_vis._inferred_intent = utils.get_attrs_specs(vis._inferred_intent)
    ldf.executor.execute([unfiltered_vis], ldf)
    v = unfiltered_vis.data[msr_attribute]
    v = v / v.sum()
    assert len(v) == len(v_filter), "Data for filtered and unfiltered vis have unequal length."
    sig = v_filter_size / v_size  # significance factor
    # Euclidean distance as L2 function
    rankSig = 1  # category measure value ranking significance factor
    # if the vis is a barchart, count how many categories' rank, based on measure value,
    # changes after the filter is applied
    if vis.mark == "bar":
        dimList = vis.get_attr_by_data_model("dimension")
        # use Pandas rank function to calculate rank positions for each category
        v_rank = unfiltered_vis.data.rank()
        v_filter_rank = vis.data.rank()
        # go through and count the number of ranking changes between the filtered and unfiltered data
        numCategories = ldf.cardinality[dimList[0].attribute]
        for r in range(0, numCategories - 1):
            if v_rank[msr_attribute][r] != v_filter_rank[msr_attribute][r]:
                rankSig += 1
        # normalize ranking significance factor
        rankSig = rankSig / numCategories
    from scipy.spatial.distance import euclidean

    return sig * rankSig * euclidean(v, v_filter)
def test_vis_custom_aggregation_as_numpy_func(global_var):
    """An aggregation given as a numpy callable is stored as-is and named after the function."""
    df = pytest.college_df
    from lux.vis.Vis import Vis
    import numpy as np

    intent = ["HighestDegree", lux.Clause("AverageCost", aggregation=np.ptp)]
    vis = Vis(intent, df)
    measure_clause = vis.get_attr_by_data_model("measure")[0]
    assert measure_clause.aggregation == np.ptp
    assert measure_clause._aggregation_name == "ptp"
def determine_encoding(ldf: LuxDataFrame, vis: Vis):
    """
    Populates Vis with the appropriate mark type and channel information based on ShowMe logic
    Currently support up to 3 dimensions or measures

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified intent
    vis : lux.vis.Vis

    Returns
    -------
    None

    Notes
    -----
    Implementing automatic encoding from Tableau's VizQL
    Mackinlay, J. D., Hanrahan, P., & Stolte, C. (2007).
    Show Me: Automatic presentation for visual analysis.
    IEEE Transactions on Visualization and Computer Graphics, 13(6), 1137-1144.
    https://doi.org/10.1109/TVCG.2007.70594
    """
    # Count number of measures and dimensions
    ndim = 0
    nmsr = 0
    filters = []
    for clause in vis._inferred_intent:
        if clause.value == "":
            if clause.data_model == "dimension":
                ndim += 1
            elif clause.data_model == "measure" and clause.attribute != "Record":
                nmsr += 1
        else:  # preserve to add back to _inferred_intent later
            filters.append(clause)

    # Helper function (TODO: Move this into utils)
    def line_or_bar(ldf, dimension: Clause, measure: Clause):
        dim_type = dimension.data_type
        # If no aggregation function is specified, then default as average
        if measure.aggregation == "":
            measure.set_aggregation("mean")
        # FIX: "oridinal" was a typo for "ordinal", so ordinal dimensions never matched
        # and were incorrectly rendered as bar charts instead of line charts
        if dim_type == "temporal" or dim_type == "ordinal":
            return "line", {"x": dimension, "y": measure}
        else:  # unordered categorical
            # if cardinality large than 5 then sort bars
            if ldf.cardinality[dimension.attribute] > 5:
                dimension.sort = "ascending"
            return "bar", {"x": measure, "y": dimension}

    # ShowMe logic + additional heuristics
    # count_col = Clause( attribute="count()", data_model="measure")
    count_col = Clause(attribute="Record", aggregation="count", data_model="measure", data_type="quantitative")
    auto_channel = {}
    if ndim == 0 and nmsr == 1:
        # Histogram with Count
        measure = vis.get_attr_by_data_model("measure", exclude_record=True)[0]
        # FIX: `len(...) < 0` is never true; the count column should be appended
        # when "Record" is absent, i.e. when the list is empty
        if len(vis.get_attr_by_attr_name("Record")) == 0:
            vis._inferred_intent.append(count_col)
        # If no bin specified, then default as 10
        if measure.bin_size == 0:
            measure.bin_size = 10
        auto_channel = {"x": measure, "y": count_col}
        vis.mark = "histogram"
    elif ndim == 1 and (nmsr == 0 or nmsr == 1):
        # Line or Bar Chart
        if nmsr == 0:
            vis._inferred_intent.append(count_col)
        dimension = vis.get_attr_by_data_model("dimension")[0]
        measure = vis.get_attr_by_data_model("measure")[0]
        vis.mark, auto_channel = line_or_bar(ldf, dimension, measure)
    elif ndim == 2 and (nmsr == 0 or nmsr == 1):
        # Line or Bar chart broken down by the dimension
        dimensions = vis.get_attr_by_data_model("dimension")
        d1 = dimensions[0]
        d2 = dimensions[1]
        if ldf.cardinality[d1.attribute] < ldf.cardinality[d2.attribute]:
            # d1.channel = "color"
            vis.remove_column_from_spec(d1.attribute)
            dimension = d2
            color_attr = d1
        else:
            if d1.attribute == d2.attribute:
                # if same attribute then remove_column_from_spec will remove both dims,
                # we only want to remove one
                vis._inferred_intent.pop(0)
            else:
                vis.remove_column_from_spec(d2.attribute)
            dimension = d1
            color_attr = d2
        # Colored Bar/Line chart with Count as default measure
        if nmsr == 0:
            vis._inferred_intent.append(count_col)
        measure = vis.get_attr_by_data_model("measure")[0]
        vis.mark, auto_channel = line_or_bar(ldf, dimension, measure)
        auto_channel["color"] = color_attr
    elif ndim == 0 and nmsr == 2:
        # Scatterplot
        vis.mark = "scatter"
        vis._inferred_intent[0].set_aggregation(None)
        vis._inferred_intent[1].set_aggregation(None)
        auto_channel = {"x": vis._inferred_intent[0], "y": vis._inferred_intent[1]}
    elif ndim == 1 and nmsr == 2:
        # Scatterplot broken down by the dimension
        measure = vis.get_attr_by_data_model("measure")
        m1 = measure[0]
        m2 = measure[1]
        vis._inferred_intent[0].set_aggregation(None)
        vis._inferred_intent[1].set_aggregation(None)
        color_attr = vis.get_attr_by_data_model("dimension")[0]
        vis.remove_column_from_spec(color_attr)
        vis.mark = "scatter"
        auto_channel = {"x": m1, "y": m2, "color": color_attr}
    elif ndim == 0 and nmsr == 3:
        # Scatterplot with color
        vis.mark = "scatter"
        auto_channel = {
            "x": vis._inferred_intent[0],
            "y": vis._inferred_intent[1],
            "color": vis._inferred_intent[2],
        }
    relevant_attributes = [auto_channel[channel].attribute for channel in auto_channel]
    relevant_min_max = dict(
        (attr, ldf.min_max[attr])
        for attr in relevant_attributes
        if attr != "Record" and attr in ldf.min_max
    )
    vis.min_max = relevant_min_max
    if auto_channel != {}:
        vis = Compiler.enforce_specified_channel(vis, auto_channel)
        vis._inferred_intent.extend(filters)  # add back the preserved filters
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score
    """
    if vis.data is None or len(vis.data) == 0:
        raise Exception(
            "Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf)."
        )
    n_dim = 0
    n_msr = 0
    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
    record_attrs = list(
        filter(lambda x: x.attribute == "Record" and x.data_model == "measure", vis_attrs_specs)
    )
    n_record = len(record_attrs)
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)
    # Line/Bar Chart
    # print("r:", n_record, "m:", n_msr, "d:",n_dim)
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        if n_filter == 0:
            v = vis.data["Number of Records"]
            return skewness(v)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 2:
            return -1
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 5:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        else:
            return -1
    # FIX: removed unreachable duplicate branch `elif n_dim == 1 and n_msr == 2: return 0.2`
    # (identical condition to the branch above, so it could never execute)
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line and barchart cases
    elif (vis.mark == "line" or vis.mark == "bar") and n_dim == 2:
        return 0.2
    # Default
    else:
        return -1
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score
    """
    if vis.data is None or len(vis.data) == 0:
        return -1
        # raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")
    n_dim = 0
    n_msr = 0
    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
    record_attrs = list(
        filter(
            lambda x: x.attribute == "Record" and x.data_model == "measure",
            vis_attrs_specs,
        )
    )
    n_record = len(record_attrs)
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)
    # Line/Bar Chart
    # print("r:", n_record, "m:", n_msr, "d:",n_dim)
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        # FIX: dropped redundant nested `"Number of Records" in vis.data` check
        # (the same membership test was performed twice in a row)
        if n_filter == 0 and "Number of Records" in vis.data:
            v = vis.data["Number of Records"]
            return skewness(v)
        elif n_filter == 1 and "Number of Records" in vis.data:
            return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
        return -1
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 10:
            return -1
        if vis.mark == "heatmap":
            return weighted_correlation(vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"])
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 10:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        else:
            return -1
    # FIX: removed unreachable duplicate branch `elif n_dim == 1 and n_msr == 2: return 0.2`
    # (identical condition to the branch above, so it could never execute)
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line and barchart cases
    elif vis.mark == "line" and n_dim == 2:
        return 0.15
    # for colored bar chart, scoring based on Chi-square test for independence score.
    # gives higher scores to colored bar charts with fewer total categories as these
    # charts are easier to read and thus more useful for users
    elif vis.mark == "bar" and n_dim == 2:
        from scipy.stats import chi2_contingency

        measure_column = vis.get_attr_by_data_model("measure")[0].attribute
        dimension_columns = vis.get_attr_by_data_model("dimension")
        groupby_column = dimension_columns[0].attribute
        color_column = dimension_columns[1].attribute
        contingency_table = []
        groupby_cardinality = ldf.cardinality[groupby_column]
        groupby_unique_vals = ldf.unique_values[groupby_column]
        for c in range(0, groupby_cardinality):
            contingency_table.append(
                vis.data[vis.data[groupby_column] == groupby_unique_vals[c]][measure_column]
            )
        score = 0.12
        # ValueError results if an entire column of the contingency table is 0,
        # can happen if an applied filter results in a category having no counts
        try:
            color_cardinality = ldf.cardinality[color_column]
            # scale down score based on number of categories
            chi2_score = chi2_contingency(contingency_table)[0] * 0.9 ** (
                color_cardinality + groupby_cardinality
            )
            score = min(0.10, chi2_score)
        except ValueError:
            pass
        return score
    # Default
    else:
        return -1
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score
    """
    if vis.data is None or len(vis.data) == 0:
        return -1
        # raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")
    try:
        filter_specs = utils.get_filter_specs(vis._inferred_intent)
        vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
        n_dim = vis._ndim
        n_msr = vis._nmsr
        n_filter = len(filter_specs)
        attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
        dimension_lst = vis.get_attr_by_data_model("dimension")
        measure_lst = vis.get_attr_by_data_model("measure")
        v_size = len(vis.data)

        # Intent-similarity scoring: when the current intent is a single filtered
        # line chart, score candidate vis by distance to that intent vis
        if (
            n_dim == 1
            and (n_msr == 0 or n_msr == 1)
            and ldf.current_vis is not None
            and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
            and len(ldf.current_vis) == 1
            and ldf.current_vis[0].mark == "line"
            and len(get_filter_specs(ldf.intent)) > 0
        ):
            query_vc = VisList(ldf.current_vis, ldf)
            query_vis = query_vc[0]
            preprocess(query_vis)
            preprocess(vis)
            return 1 - euclidean_dist(query_vis, vis)

        # Line/Bar Chart
        # print("r:", n_record, "m:", n_msr, "d:",n_dim)
        if n_dim == 1 and (n_msr == 0 or n_msr == 1):
            if v_size < 2:
                return -1
            if n_filter == 0:
                return unevenness(vis, ldf, measure_lst, dimension_lst)
            elif n_filter == 1:
                return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
        # Histogram
        elif n_dim == 0 and n_msr == 1:
            if v_size < 2:
                return -1
            # FIX: dropped redundant nested `"Number of Records" in vis.data` check
            # (the same membership test was performed twice in a row)
            if n_filter == 0 and "Number of Records" in vis.data:
                v = vis.data["Number of Records"]
                return skewness(v)
            elif n_filter == 1 and "Number of Records" in vis.data:
                return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
            return -1
        # Scatter Plot
        elif n_dim == 0 and n_msr == 2:
            if v_size < 10:
                return -1
            if vis.mark == "heatmap":
                return weighted_correlation(
                    vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"]
                )
            if n_filter == 1:
                v_filter_size = get_filtered_size(filter_specs, vis.data)
                sig = v_filter_size / v_size
            else:
                sig = 1
            return sig * monotonicity(vis, attr_specs)
        # Scatterplot colored by Dimension
        elif n_dim == 1 and n_msr == 2:
            if v_size < 10:
                return -1
            color_attr = vis.get_attr_by_channel("color")[0].attribute
            C = ldf.cardinality[color_attr]
            if C < 40:
                return 1 / C
            else:
                return -1
        # FIX: removed unreachable duplicate branch `elif n_dim == 1 and n_msr == 2: return 0.2`
        # (identical condition to the branch above, so it could never execute)
        # Scatterplot colored by measure
        elif n_msr == 3:
            return 0.1
        # colored line and barchart cases
        elif vis.mark == "line" and n_dim == 2:
            return 0.15
        # for colored bar chart, scoring based on Chi-square test for independence score.
        # gives higher scores to colored bar charts with fewer total categories as these
        # charts are easier to read and thus more useful for users
        elif vis.mark == "bar" and n_dim == 2:
            from scipy.stats import chi2_contingency

            measure_column = vis.get_attr_by_data_model("measure")[0].attribute
            dimension_columns = vis.get_attr_by_data_model("dimension")
            groupby_column = dimension_columns[0].attribute
            color_column = dimension_columns[1].attribute
            contingency_tbl = pd.crosstab(
                vis.data[groupby_column],
                vis.data[color_column],
                values=vis.data[measure_column],
                aggfunc=sum,
            )
            try:
                color_cardinality = ldf.cardinality[color_column]
                groupby_cardinality = ldf.cardinality[groupby_column]
                # scale down score based on number of categories
                chi2_score = chi2_contingency(contingency_tbl)[0] * 0.9 ** (
                    color_cardinality + groupby_cardinality
                )
                score = min(0.10, chi2_score)
            except (ValueError, KeyError):
                # ValueError results if an entire column of the contingency table is 0,
                # can happen if an applied filter results in a category having no counts
                score = -1
            return score
        # Default
        else:
            return -1
    # FIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit when the
    # fallback was enabled; narrowed to Exception
    except Exception:
        if lux.config.interestingness_fallback:
            # Supress interestingness related issues
            warnings.warn(f"An error occurred when computing interestingness for: {vis}")
            return -1
        else:
            raise
def determine_encoding(ldf: LuxDataFrame, vis: Vis):
    """
    Populates Vis with the appropriate mark type and channel information based on ShowMe logic
    Currently support up to 3 dimensions or measures

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent
    vis : lux.vis.Vis

    Returns
    -------
    None

    Notes
    -----
    Implementing automatic encoding from Tableau's VizQL
    Mackinlay, J. D., Hanrahan, P., & Stolte, C. (2007).
    Show Me: Automatic presentation for visual analysis.
    IEEE Transactions on Visualization and Computer Graphics, 13(6), 1137-1144.
    https://doi.org/10.1109/TVCG.2007.70594
    """
    # Count number of measures and dimensions
    ndim = vis._ndim
    nmsr = vis._nmsr
    # preserve to add back to _inferred_intent later
    filters = utils.get_filter_specs(vis._inferred_intent)

    # Helper function (TODO: Move this into utils)
    def line_or_bar_or_geo(ldf, dimension: Clause, measure: Clause):
        dim_type = dimension.data_type
        # If no aggregation function is specified, then default as average
        if measure.aggregation == "":
            measure.set_aggregation("mean")
        # FIX: "oridinal" was a typo for "ordinal", so ordinal dimensions never matched
        # and fell through to the categorical bar-chart branch
        if dim_type == "temporal" or dim_type == "ordinal":
            if isinstance(dimension.attribute, pd.Timestamp):
                # If timestamp, use the _repr_ (e.g., TimeStamp('2020-04-05 00.000')--> '2020-04-05')
                attr = str(dimension.attribute._date_repr)
            else:
                attr = dimension.attribute
            if ldf.cardinality[attr] == 1:
                return "bar", {"x": measure, "y": dimension}
            else:
                return "line", {"x": dimension, "y": measure}
        else:  # unordered categorical
            # if cardinality large than 5 then sort bars
            if ldf.cardinality[dimension.attribute] > 5:
                dimension.sort = "ascending"
            if utils.like_geo(dimension.get_attr()):
                return "geographical", {"x": dimension, "y": measure}
            return "bar", {"x": measure, "y": dimension}

    # ShowMe logic + additional heuristics
    # count_col = Clause( attribute="count()", data_model="measure")
    count_col = Clause(
        attribute="Record",
        aggregation="count",
        data_model="measure",
        data_type="quantitative",
    )
    auto_channel = {}
    if ndim == 0 and nmsr == 1:
        # Histogram with Count
        measure = vis.get_attr_by_data_model("measure", exclude_record=True)[0]
        # FIX: `len(...) < 0` is never true; the count column should be appended
        # when "Record" is absent, i.e. when the list is empty
        if len(vis.get_attr_by_attr_name("Record")) == 0:
            vis._inferred_intent.append(count_col)
        # If no bin specified, then default as 10
        if measure.bin_size == 0:
            measure.bin_size = 10
        auto_channel = {"x": measure, "y": count_col}
        vis._mark = "histogram"
    elif ndim == 1 and (nmsr == 0 or nmsr == 1):
        # Line or Bar Chart
        if nmsr == 0:
            vis._inferred_intent.append(count_col)
        dimension = vis.get_attr_by_data_model("dimension")[0]
        measure = vis.get_attr_by_data_model("measure")[0]
        vis._mark, auto_channel = line_or_bar_or_geo(ldf, dimension, measure)
    elif ndim == 2 and (nmsr == 0 or nmsr == 1):
        # Line or Bar chart broken down by the dimension
        dimensions = vis.get_attr_by_data_model("dimension")
        d1 = dimensions[0]
        d2 = dimensions[1]
        if ldf.cardinality[d1.attribute] < ldf.cardinality[d2.attribute]:
            # d1.channel = "color"
            vis.remove_column_from_spec(d1.attribute)
            dimension = d2
            color_attr = d1
        else:
            # if same attribute then remove_column_from_spec will remove both dims,
            # we only want to remove one
            if d1.attribute == d2.attribute:
                vis._inferred_intent.pop(0)
            else:
                vis.remove_column_from_spec(d2.attribute)
            dimension = d1
            color_attr = d2
        # Colored Bar/Line chart with Count as default measure
        if not ldf.pre_aggregated:
            # FIX: inner condition redundantly repeated `not ldf.pre_aggregated`,
            # which is already guaranteed by the enclosing if
            if nmsr == 0:
                vis._inferred_intent.append(count_col)
        measure = vis.get_attr_by_data_model("measure")[0]
        vis._mark, auto_channel = line_or_bar_or_geo(ldf, dimension, measure)
        auto_channel["color"] = color_attr
    elif ndim == 0 and nmsr == 2:
        # Scatterplot
        vis._mark = "scatter"
        vis._inferred_intent[0].set_aggregation(None)
        vis._inferred_intent[1].set_aggregation(None)
        auto_channel = {"x": vis._inferred_intent[0], "y": vis._inferred_intent[1]}
    elif ndim == 1 and nmsr == 2:
        # Scatterplot broken down by the dimension
        measure = vis.get_attr_by_data_model("measure")
        m1 = measure[0]
        m2 = measure[1]
        vis._inferred_intent[0].set_aggregation(None)
        vis._inferred_intent[1].set_aggregation(None)
        color_attr = vis.get_attr_by_data_model("dimension")[0]
        vis.remove_column_from_spec(color_attr)
        vis._mark = "scatter"
        auto_channel = {"x": m1, "y": m2, "color": color_attr}
    elif ndim == 0 and nmsr == 3:
        # Scatterplot with color
        vis._mark = "scatter"
        auto_channel = {
            "x": vis._inferred_intent[0],
            "y": vis._inferred_intent[1],
            "color": vis._inferred_intent[2],
        }
    relevant_attributes = [auto_channel[channel].attribute for channel in auto_channel]
    relevant_min_max = dict(
        (attr, ldf._min_max[attr])
        for attr in relevant_attributes
        if attr != "Record" and attr in ldf._min_max
    )
    # Replace scatterplot with heatmap
    HBIN_START = 5000
    if vis.mark == "scatter" and lux.config.heatmap and len(ldf) > HBIN_START:
        vis._postbin = True
        ldf._message.add_unique(
            f"Large scatterplots detected: Lux is automatically binning scatterplots to heatmaps.",
            priority=98,
        )
        vis._mark = "heatmap"
    vis._min_max = relevant_min_max
    if auto_channel != {}:
        vis = Compiler.enforce_specified_channel(vis, auto_channel)
        vis._inferred_intent.extend(filters)  # add back the preserved filters