def populate_wildcard_options(_inferred_intent: List[Clause], ldf: LuxDataFrame) -> dict:
    """
    Given wildcards and constraints in the LuxDataFrame's intent,
    return the list of available values that satisfies the data_type or data_model constraints.

    Parameters
    ----------
    _inferred_intent : List[Clause]
        Clauses to expand. A filter clause whose value is the wildcard "?" is
        replaced in place (at its own position in this list) by a new Clause
        with ``filter_op="="`` and the list of unique values as its value.
    ldf : LuxDataFrame
        LuxDataFrame with row or attributes populated with available wildcard options.

    Returns
    -------
    intent: Dict[str,list]
        a dictionary that holds the attributes and filters generated from wildcards and constraints.
        ``intent["attributes"]`` holds one list of candidate clauses per attribute clause;
        ``intent["filters"]`` is a flat list of candidate filter clauses.
    """
    import copy

    from lux.utils.utils import convert_to_list

    intent = {"attributes": [], "filters": []}
    # enumerate() supplies the clause's own position, so the in-place
    # replacement below never has to search the list for a clause that may
    # already have been swapped out.
    for position, clause in enumerate(_inferred_intent):
        spec_options = []
        if clause.value == "":  # attribute
            if clause.attribute == "?":
                allowed = set(ldf.columns)  # all attributes
                if clause.data_type != "":
                    allowed &= set(ldf.data_type[clause.data_type])
                if clause.data_model != "":
                    allowed &= set(ldf.data_model[clause.data_model])
                # Keep dataframe column order (de-duplicated) so the expansion
                # is deterministic instead of following set iteration order.
                options = [col for col in dict.fromkeys(ldf.columns) if col in allowed]
            else:
                options = convert_to_list(clause.attribute)
            for option in options:
                if str(option) not in clause.exclude:
                    spec_copy = copy.copy(clause)
                    spec_copy.attribute = option
                    spec_options.append(spec_copy)
            intent["attributes"].append(spec_options)
        else:  # filters
            for attr in convert_to_list(clause.attribute):
                if clause.value == "?":
                    options = ldf.unique_values[attr]
                    # NOTE(review): when the clause lists several attributes the
                    # stored clause ends up holding the unique values of the
                    # attribute processed last -- confirm this is the intent.
                    _inferred_intent[position] = Clause(
                        attribute=clause.attribute,
                        filter_op="=",
                        value=list(options),
                    )
                else:
                    options = convert_to_list(clause.value)
                for option in options:
                    if str(option) not in clause.exclude:
                        spec_copy = copy.copy(clause)
                        spec_copy.attribute = attr
                        spec_copy.value = option
                        spec_options.append(spec_copy)
            intent["filters"].extend(spec_options)

    return intent
def determine_encoding(ldf: LuxDataFrame, vis: Vis):
    """
    Populates Vis with the appropriate mark type and channel information based on ShowMe logic
    Currently support up to 3 dimensions or measures

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified intent
    vis : lux.vis.Vis
        Vis whose ``_inferred_intent``, ``mark`` and ``min_max`` are updated in place.

    Returns
    -------
    None

    Notes
    -----
    Implementing automatic encoding from Tableau's VizQL
    Mackinlay, J. D., Hanrahan, P., & Stolte, C. (2007).
    Show Me: Automatic presentation for visual analysis.
    IEEE Transactions on Visualization and Computer Graphics, 13(6), 1137–1144.
    https://doi.org/10.1109/TVCG.2007.70594
    """
    # Count number of measures and dimensions; the "Record" measure is not counted.
    ndim = 0
    nmsr = 0
    filters = []
    for clause in vis._inferred_intent:
        if clause.value == "":
            if clause.data_model == "dimension":
                ndim += 1
            elif clause.data_model == "measure" and clause.attribute != "Record":
                nmsr += 1
        else:  # preserve to add back to _inferred_intent later
            filters.append(clause)

    # Helper function (TODO: Move this into utils)
    def line_or_bar(ldf, dimension: Clause, measure: Clause):
        # If no aggregation function is specified, then default as average
        if measure.aggregation == "":
            measure.set_aggregation("mean")
        # "oridinal" is a historical misspelling that is still accepted so that
        # any caller relying on it keeps working.
        if dimension.data_type in ("temporal", "ordinal", "oridinal"):
            return "line", {"x": dimension, "y": measure}
        # unordered categorical
        # if cardinality large than 5 then sort bars
        if ldf.cardinality[dimension.attribute] > 5:
            dimension.sort = "ascending"
        return "bar", {"x": measure, "y": dimension}

    # ShowMe logic + additional heuristics
    count_col = Clause(
        attribute="Record",
        aggregation="count",
        data_model="measure",
        data_type="quantitative",
    )
    auto_channel = {}
    if ndim == 0 and nmsr == 1:
        # Histogram with Count
        measure = vis.get_attr_by_data_model("measure", exclude_record=True)[0]
        # Add the count column only when no "Record" clause exists yet
        # (a length can never be negative, so a `< 0` test would never fire).
        if len(vis.get_attr_by_attr_name("Record")) == 0:
            vis._inferred_intent.append(count_col)
        # If no bin specified, then default as 10
        if measure.bin_size == 0:
            measure.bin_size = 10
        auto_channel = {"x": measure, "y": count_col}
        vis.mark = "histogram"
    elif ndim == 1 and (nmsr == 0 or nmsr == 1):
        # Line or Bar Chart
        if nmsr == 0:
            vis._inferred_intent.append(count_col)
        dimension = vis.get_attr_by_data_model("dimension")[0]
        measure = vis.get_attr_by_data_model("measure")[0]
        vis.mark, auto_channel = line_or_bar(ldf, dimension, measure)
    elif ndim == 2 and (nmsr == 0 or nmsr == 1):
        # Line or Bar chart broken down by the dimension
        dimensions = vis.get_attr_by_data_model("dimension")
        d1 = dimensions[0]
        d2 = dimensions[1]
        if ldf.cardinality[d1.attribute] < ldf.cardinality[d2.attribute]:
            # d1 has the lower cardinality, so it becomes the color channel
            vis.remove_column_from_spec(d1.attribute)
            dimension = d2
            color_attr = d1
        else:
            if d1.attribute == d2.attribute:
                # if same attribute then remove_column_from_spec will remove
                # both dims, we only want to remove one
                vis._inferred_intent.pop(0)
            else:
                vis.remove_column_from_spec(d2.attribute)
            dimension = d1
            color_attr = d2
        # Colored Bar/Line chart with Count as default measure
        if nmsr == 0:
            vis._inferred_intent.append(count_col)
        measure = vis.get_attr_by_data_model("measure")[0]
        vis.mark, auto_channel = line_or_bar(ldf, dimension, measure)
        auto_channel["color"] = color_attr
    elif ndim == 0 and nmsr == 2:
        # Scatterplot
        # NOTE(review): the two measures are taken by position (indices 0 and 1);
        # this presumably assumes no filter clause precedes them -- confirm.
        vis.mark = "scatter"
        vis._inferred_intent[0].set_aggregation(None)
        vis._inferred_intent[1].set_aggregation(None)
        auto_channel = {"x": vis._inferred_intent[0], "y": vis._inferred_intent[1]}
    elif ndim == 1 and nmsr == 2:
        # Scatterplot broken down by the dimension
        measure = vis.get_attr_by_data_model("measure")
        m1 = measure[0]
        m2 = measure[1]
        # NOTE(review): aggregation is cleared on the clauses at indices 0 and 1,
        # which are not necessarily m1 and m2 -- confirm.
        vis._inferred_intent[0].set_aggregation(None)
        vis._inferred_intent[1].set_aggregation(None)
        color_attr = vis.get_attr_by_data_model("dimension")[0]
        # NOTE(review): other call sites pass an attribute name here, this one
        # passes the Clause itself -- confirm which one is expected.
        vis.remove_column_from_spec(color_attr)
        vis.mark = "scatter"
        auto_channel = {"x": m1, "y": m2, "color": color_attr}
    elif ndim == 0 and nmsr == 3:
        # Scatterplot with color
        vis.mark = "scatter"
        auto_channel = {
            "x": vis._inferred_intent[0],
            "y": vis._inferred_intent[1],
            "color": vis._inferred_intent[2],
        }

    # Keep only the min/max entries of attributes that ended up on a channel.
    relevant_attributes = [encoding.attribute for encoding in auto_channel.values()]
    vis.min_max = {
        attr: ldf.min_max[attr]
        for attr in relevant_attributes
        if attr != "Record" and attr in ldf.min_max
    }
    if auto_channel:
        vis = Compiler.enforce_specified_channel(vis, auto_channel)
        vis._inferred_intent.extend(filters)  # add back the preserved filters
def determine_encoding(ldf: LuxDataFrame, vis: Vis):
    """
    Populates Vis with the appropriate mark type and channel information based on ShowMe logic
    Currently support up to 3 dimensions or measures

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent
    vis : lux.vis.Vis
        Vis whose ``_inferred_intent``, ``_mark``, ``_min_max`` (and ``_postbin``
        when a scatterplot is turned into a heatmap) are updated in place.

    Returns
    -------
    None

    Notes
    -----
    Implementing automatic encoding from Tableau's VizQL
    Mackinlay, J. D., Hanrahan, P., & Stolte, C. (2007).
    Show Me: Automatic presentation for visual analysis.
    IEEE Transactions on Visualization and Computer Graphics, 13(6), 1137–1144.
    https://doi.org/10.1109/TVCG.2007.70594
    """
    # Count number of measures and dimensions
    ndim = vis._ndim
    nmsr = vis._nmsr
    # preserve to add back to _inferred_intent later
    filters = utils.get_filter_specs(vis._inferred_intent)

    # Helper function (TODO: Move this into utils)
    def line_or_bar_or_geo(ldf, dimension: Clause, measure: Clause):
        # If no aggregation function is specified, then default as average
        if measure.aggregation == "":
            measure.set_aggregation("mean")
        # "oridinal" is a historical misspelling that is still accepted so that
        # any caller relying on it keeps working.
        if dimension.data_type in ("temporal", "ordinal", "oridinal"):
            if isinstance(dimension.attribute, pd.Timestamp):
                # If timestamp, use the _repr_ (e.g., TimeStamp('2020-04-05 00.000')--> '2020-04-05')
                attr = str(dimension.attribute._date_repr)
            else:
                attr = dimension.attribute
            # A single distinct value is drawn as a bar rather than a line.
            if ldf.cardinality[attr] == 1:
                return "bar", {"x": measure, "y": dimension}
            return "line", {"x": dimension, "y": measure}
        # unordered categorical
        # if cardinality large than 5 then sort bars
        if ldf.cardinality[dimension.attribute] > 5:
            dimension.sort = "ascending"
        if utils.like_geo(dimension.get_attr()):
            return "geographical", {"x": dimension, "y": measure}
        return "bar", {"x": measure, "y": dimension}

    # ShowMe logic + additional heuristics
    count_col = Clause(
        attribute="Record",
        aggregation="count",
        data_model="measure",
        data_type="quantitative",
    )
    auto_channel = {}
    if ndim == 0 and nmsr == 1:
        # Histogram with Count
        measure = vis.get_attr_by_data_model("measure", exclude_record=True)[0]
        # Add the count column only when no "Record" clause exists yet
        # (a length can never be negative, so a `< 0` test would never fire).
        if len(vis.get_attr_by_attr_name("Record")) == 0:
            vis._inferred_intent.append(count_col)
        # If no bin specified, then default as 10
        if measure.bin_size == 0:
            measure.bin_size = 10
        auto_channel = {"x": measure, "y": count_col}
        vis._mark = "histogram"
    elif ndim == 1 and (nmsr == 0 or nmsr == 1):
        # Line or Bar Chart
        if nmsr == 0:
            vis._inferred_intent.append(count_col)
        dimension = vis.get_attr_by_data_model("dimension")[0]
        measure = vis.get_attr_by_data_model("measure")[0]
        vis._mark, auto_channel = line_or_bar_or_geo(ldf, dimension, measure)
    elif ndim == 2 and (nmsr == 0 or nmsr == 1):
        # Line or Bar chart broken down by the dimension
        dimensions = vis.get_attr_by_data_model("dimension")
        d1 = dimensions[0]
        d2 = dimensions[1]
        if ldf.cardinality[d1.attribute] < ldf.cardinality[d2.attribute]:
            # d1 has the lower cardinality, so it becomes the color channel
            vis.remove_column_from_spec(d1.attribute)
            dimension = d2
            color_attr = d1
        else:
            # if same attribute then remove_column_from_spec will remove both
            # dims, we only want to remove one
            if d1.attribute == d2.attribute:
                vis._inferred_intent.pop(0)
            else:
                vis.remove_column_from_spec(d2.attribute)
            dimension = d1
            color_attr = d2
        # Colored Bar/Line chart with Count as default measure.
        # When the frame is pre-aggregated no automatic encoding is chosen here.
        if not ldf.pre_aggregated:
            if nmsr == 0:
                vis._inferred_intent.append(count_col)
            measure = vis.get_attr_by_data_model("measure")[0]
            vis._mark, auto_channel = line_or_bar_or_geo(ldf, dimension, measure)
            auto_channel["color"] = color_attr
    elif ndim == 0 and nmsr == 2:
        # Scatterplot
        # NOTE(review): the two measures are taken by position (indices 0 and 1);
        # this presumably assumes no other clause precedes them -- confirm.
        vis._mark = "scatter"
        vis._inferred_intent[0].set_aggregation(None)
        vis._inferred_intent[1].set_aggregation(None)
        auto_channel = {"x": vis._inferred_intent[0], "y": vis._inferred_intent[1]}
    elif ndim == 1 and nmsr == 2:
        # Scatterplot broken down by the dimension
        measure = vis.get_attr_by_data_model("measure")
        m1 = measure[0]
        m2 = measure[1]
        # NOTE(review): aggregation is cleared on the clauses at indices 0 and 1,
        # which are not necessarily m1 and m2 -- confirm.
        vis._inferred_intent[0].set_aggregation(None)
        vis._inferred_intent[1].set_aggregation(None)
        color_attr = vis.get_attr_by_data_model("dimension")[0]
        # NOTE(review): other call sites pass an attribute name here, this one
        # passes the Clause itself -- confirm which one is expected.
        vis.remove_column_from_spec(color_attr)
        vis._mark = "scatter"
        auto_channel = {"x": m1, "y": m2, "color": color_attr}
    elif ndim == 0 and nmsr == 3:
        # Scatterplot with color
        vis._mark = "scatter"
        auto_channel = {
            "x": vis._inferred_intent[0],
            "y": vis._inferred_intent[1],
            "color": vis._inferred_intent[2],
        }

    # Keep only the min/max entries of attributes that ended up on a channel.
    relevant_attributes = [encoding.attribute for encoding in auto_channel.values()]
    relevant_min_max = {
        attr: ldf._min_max[attr]
        for attr in relevant_attributes
        if attr != "Record" and attr in ldf._min_max
    }
    # Replace scatterplot with heatmap once the frame has more rows than HBIN_START
    HBIN_START = 5000
    if vis.mark == "scatter" and lux.config.heatmap and len(ldf) > HBIN_START:
        vis._postbin = True
        ldf._message.add_unique(
            "Large scatterplots detected: Lux is automatically binning scatterplots to heatmaps.",
            priority=98,
        )
        vis._mark = "heatmap"
    vis._min_max = relevant_min_max
    if auto_channel:
        vis = Compiler.enforce_specified_channel(vis, auto_channel)
        vis._inferred_intent.extend(filters)  # add back the preserved filters