def execute_filter(vis: Vis) -> bool:
    """
    Apply a Vis's filter to vis.data

    Parameters
    ----------
    vis : Vis

    Returns
    -------
    bool
        Boolean flag indicating if any filter was applied
    """
    assert (
        vis.data is not None
    ), "execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)"
    filters = utils.get_filter_specs(vis._inferred_intent)
    if filters:
        # TODO: Need to handle OR logic
        for filter in filters:
            vis._vis_data = PandasExecutor.apply_filter(
                vis.data, filter.attribute, filter.filter_op, filter.value
            )
        return True
    else:
        return False
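# Hedged usage sketch for the executor above: constructing a Vis with a filter
# clause and a source dataframe triggers execution, which applies the filter
# through execute_filter. The dataset path matches the tests in this
# collection; the exact execution entry point may differ across lux versions.
import pandas as pd
import lux
from lux.vis.Vis import Vis

df = pd.read_csv("lux/data/car.csv")
vis = Vis(["Horsepower", lux.Clause(attribute="Origin", filter_op="=", value="USA")], df)
# vis.data should now reflect only the rows where Origin == "USA"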
def enhance(ldf):
    """
    Given a set of vis, generates possible visualizations when an additional attribute is added to the current vis.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Enhance action.
    """
    filters = utils.get_filter_specs(ldf._intent)
    # Collect variables that already exist in the intent
    attr_specs = list(filter(lambda x: x.value == "" and x.attribute != "Record", ldf._intent))
    fltr_str = [fltr.attribute + fltr.filter_op + str(fltr.value) for fltr in filters]
    attr_str = [str(clause.attribute) for clause in attr_specs]
    intended_attrs = f'<p class="highlight-intent">{", ".join(attr_str + fltr_str)}</p>'
    if len(attr_specs) == 1:
        recommendation = {
            "action": "Enhance",
            "description": f"Augmenting current {intended_attrs} intent with additional attribute.",
        }
    elif len(attr_specs) == 2:
        recommendation = {
            "action": "Enhance",
            "description": f"Further breaking down current {intended_attrs} intent by additional attribute.",
        }
    # if there are too many column attributes, don't generate Enhance recommendations
    elif len(attr_specs) > 2:
        recommendation = {"action": "Enhance"}
        recommendation["collection"] = []
        return recommendation
    intent = ldf._intent.copy()
    # Clear channel so that channel is not enforced based on input vis intent
    for clause in intent:
        clause.channel = ""
    intent = filters + attr_specs
    intent.append("?")
    vlist = lux.vis.VisList.VisList(intent, ldf)
    # Then use the data populated in the vis list to compute score
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    vlist.sort()
    vlist = vlist.showK()
    recommendation["collection"] = vlist
    return recommendation
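# Hedged usage sketch: Enhance runs as part of the default recommendation
# pipeline once an intent is set, so it is typically reached through the
# dataframe rather than called directly.
import pandas as pd
import lux

df = pd.read_csv("lux/data/car.csv")
df.intent = ["MilesPerGal"]
df._repr_html_()              # triggers recommendation computation
df.recommendation["Enhance"]  # VisList of two-attribute visualizations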
def execute_filter(view: Vis):
    """
    Helper function to convert a Vis' filter specification to a SQL WHERE clause.
    Takes in a Vis object and returns an appropriate SQL WHERE clause based on the
    filters specified in the vis' _inferred_intent.

    Parameters
    ----------
    view : lux.Vis
        lux.Vis object that represents a visualization

    Returns
    -------
    where_clause : string
        String representation of a SQL WHERE clause
    filter_vars : list of strings
        list of variables that have been used as filters
    """
    where_clause = []
    filters = utils.get_filter_specs(view._inferred_intent)
    filter_vars = []
    if filters:
        for f in range(0, len(filters)):
            if f == 0:
                where_clause.append("WHERE")
            else:
                where_clause.append("AND")
            curr_value = str(filters[f].value)
            curr_value = curr_value.replace("'", "''")
            where_clause.extend(
                [
                    '"' + str(filters[f].attribute) + '"',
                    str(filters[f].filter_op),
                    "'" + curr_value + "'",
                ]
            )
            if filters[f].attribute not in filter_vars:
                filter_vars.append(filters[f].attribute)
    attributes = utils.get_attrs_specs(view._inferred_intent)
    # need to ensure that no null values are included in the data;
    # null values break binning queries
    for a in attributes:
        if a.attribute != "Record":
            if where_clause == []:
                where_clause.append("WHERE")
            else:
                where_clause.append("AND")
            where_clause.extend(
                [
                    '"' + str(a.attribute) + '"',
                    "IS NOT NULL",
                ]
            )
    if where_clause == []:
        return ("", [])
    else:
        where_clause = " ".join(where_clause)
    return (where_clause, filter_vars)
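# Worked example, derived directly from the logic above (illustrative only):
# for an inferred intent with attribute "Horsepower" and filter Origin = 'USA',
# the helper returns roughly
#   ('WHERE "Origin" = \'USA\' AND "Horsepower" IS NOT NULL', ['Origin'])
# with any single quotes inside filter values doubled for SQL escaping.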
def execute_filter(view: View):
    assert (
        view.data is not None
    ), "execute_filter assumes input view.data is populated (if not, populate with LuxDataFrame values)"
    filters = utils.get_filter_specs(view.spec_lst)
    if filters:
        # TODO: Need to handle OR logic
        for filter in filters:
            view.data = PandasExecutor.apply_filter(
                view.data, filter.attribute, filter.filter_op, filter.value
            )
def context_to_JSON(context):
    from lux.utils import utils

    filter_specs = utils.get_filter_specs(context)
    attrs_specs = utils.get_attrs_specs(context)

    specs = {}
    specs["attributes"] = [spec.attribute for spec in attrs_specs]
    specs["filters"] = [spec.attribute for spec in filter_specs]
    return specs
def intent_to_JSON(intent):
    from lux.utils import utils

    filter_specs = utils.get_filter_specs(intent)
    attrs_specs = utils.get_attrs_specs(intent)

    intent = {}
    intent["attributes"] = [clause.attribute for clause in attrs_specs]
    intent["filters"] = [clause.attribute for clause in filter_specs]
    return intent
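# Worked example for the JSON helpers above (hedged; assumes the Clause fields
# used elsewhere in this collection, where an empty value marks an attribute
# clause and a non-empty value marks a filter clause):
import lux

intent = [
    lux.Clause(attribute="Horsepower"),
    lux.Clause(attribute="Origin", filter_op="=", value="USA"),
]
intent_to_JSON(intent)
# -> {"attributes": ["Horsepower"], "filters": ["Origin"]}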
def execute_filter(vis: Vis):
    assert (
        vis.data is not None
    ), "execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)"
    filters = utils.get_filter_specs(vis._inferred_intent)
    if filters:
        # TODO: Need to handle OR logic
        for filter in filters:
            vis._vis_data = PandasExecutor.apply_filter(
                vis.data, filter.attribute, filter.filter_op, filter.value
            )
        return True
    else:
        return False
def test_filter_inequality():
    df = pd.read_csv("lux/data/car.csv")
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
    df.set_intent(
        [
            lux.Clause(attribute="Horsepower"),
            lux.Clause(attribute="MilesPerGal"),
            lux.Clause(attribute="Acceleration", filter_op=">", value=10),
        ]
    )
    df._repr_html_()

    from lux.utils.utils import get_filter_specs

    complement_vis = df.recommendation["Filter"][0]
    fltr_clause = get_filter_specs(complement_vis._intent)[0]
    assert fltr_clause.filter_op == "<="
    assert fltr_clause.value == 10
def enhance(ldf):
    '''
    Given a set of views, generates possible visualizations when an additional attribute is added to the current view.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Enhance action.
    '''
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    recommendation = {
        "action": "Enhance",
        "description": "Shows possible visualizations when an additional attribute is added to the current view.",
    }
    filters = utils.get_filter_specs(ldf.context)
    # Collect variables that already exist in the context
    attr_specs = list(filter(lambda x: x.value == "" and x.attribute != "Record", ldf.context))
    # if there are too many column attributes, don't generate Enhance recommendations
    if len(attr_specs) > 2:
        recommendation["collection"] = []
        return recommendation
    query = ldf.context.copy()
    query = filters + attr_specs
    query.append("?")
    vc = lux.view.ViewCollection.ViewCollection(query)
    vc = vc.load(ldf)
    # Then use the data populated in the view collection to compute score
    for view in vc:
        view.score = interestingness(view, ldf)
    vc = vc.topK(15)
    recommendation["collection"] = vc
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed enhance action in {toc - tic:0.4f} seconds")
    return recommendation
def test_filter_inequality(global_var):
    df = pytest.car_df
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
    df.set_intent(
        [
            lux.Clause(attribute="Horsepower"),
            lux.Clause(attribute="MilesPerGal"),
            lux.Clause(attribute="Acceleration", filter_op=">", value=10),
        ]
    )
    df._ipython_display_()

    from lux.utils.utils import get_filter_specs

    complement_vis = df.recommendation["Filter"][0]
    fltr_clause = get_filter_specs(complement_vis._intent)[0]
    assert fltr_clause.filter_op == "<="
    assert fltr_clause.value == 10
def execute_filter(view: Vis):
    """
    Helper function to convert a Vis' filter specification to a SQL WHERE clause.
    Takes in a Vis object and returns an appropriate SQL WHERE clause based on the
    filters specified in the vis' _inferred_intent.

    Parameters
    ----------
    view : lux.Vis
        lux.Vis object that represents a visualization

    Returns
    -------
    where_clause : string
        String representation of a SQL WHERE clause
    filter_vars : list of strings
        list of variables that have been used as filters
    """
    filters = utils.get_filter_specs(view._inferred_intent)
    return SQLExecutor.create_where_clause(filters, view=view)
def execute(view_collection: ViewCollection, ldf: LuxDataFrame):
    '''
    Given a ViewCollection, fetch the data required to render the view
    1) Apply filters
    2) Retrieve relevant attributes
    3) Return a DataFrame with relevant results
    '''
    import pandas as pd

    for view in view_collection:
        print(view, utils.get_filter_specs(view.spec_lst))
        # Select relevant data based on attribute information
        attributes = set([])
        for spec in view.spec_lst:
            if spec.attribute:
                attributes.add(spec.attribute)
        if view.mark not in ["bar", "line", "histogram"]:
            where_clause, filter_vars = SQLExecutor.execute_filter(view)
            required_variables = attributes | set(filter_vars)
            required_variables = ",".join(required_variables)
            row_count = list(
                pd.read_sql(
                    "SELECT COUNT(*) FROM {} {}".format(ldf.table_name, where_clause),
                    ldf.SQLconnection,
                )["count"]
            )[0]
            if row_count > 10000:
                query = "SELECT {} FROM {} {} ORDER BY random() LIMIT 10000".format(
                    required_variables, ldf.table_name, where_clause
                )
            else:
                query = "SELECT {} FROM {} {}".format(
                    required_variables, ldf.table_name, where_clause
                )
            data = pd.read_sql(query, ldf.SQLconnection)
            view.data = utils.pandas_to_lux(data)
        if view.mark == "bar" or view.mark == "line":
            SQLExecutor.execute_aggregate(view, ldf)
        elif view.mark == "histogram":
            SQLExecutor.execute_binning(view, ldf)
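# Illustrative only (hypothetical table name "cars"): for a scatter view over
# Horsepower and MilesPerGal with filter Origin = 'USA', the generated query
# resembles
#   SELECT Horsepower,MilesPerGal,Origin FROM cars WHERE Origin = 'USA'
# (the filter variable is selected too, and "ORDER BY random() LIMIT 10000"
# is appended when the filtered row count exceeds 10000).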
def execute_filter(view: Vis):
    where_clause = []
    filters = utils.get_filter_specs(view._inferred_intent)
    filter_vars = []
    if filters:
        for f in range(0, len(filters)):
            if f == 0:
                where_clause.append("WHERE")
            else:
                where_clause.append("AND")
            where_clause.extend(
                [
                    str(filters[f].attribute),
                    str(filters[f].filter_op),
                    "'" + str(filters[f].value) + "'",
                ]
            )
            if filters[f].attribute not in filter_vars:
                filter_vars.append(filters[f].attribute)
    if where_clause == []:
        return ("", [])
    else:
        where_clause = " ".join(where_clause)
    return (where_clause, filter_vars)
def generalize(ldf):
    '''
    Generates all possible visualizations when one attribute or filter from the current view is removed.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Generalize action.
    '''
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    # takes in a dataObject and generates a list of new dataObjects, each with a single measure from the original object removed
    # --> return list of dataObjects with corresponding interestingness scores
    recommendation = {
        "action": "Generalize",
        "description": "Remove one attribute or filter to observe a more general trend.",
    }
    output = []
    excluded_columns = []
    column_spec = list(filter(lambda x: x.value == "" and x.attribute != "Record", ldf.context))
    row_specs = utils.get_filter_specs(ldf.context)
    # if we do not have enough column attributes, or have too many, return no views.
    if len(column_spec) < 2 or len(column_spec) > 4:
        recommendation["collection"] = []
        return recommendation
    for spec in column_spec:
        columns = spec.attribute
        if type(columns) == list:
            for column in columns:
                if column not in excluded_columns:
                    temp_view = View(ldf.context)
                    temp_view.remove_column_from_spec_new(column, remove_first=True)
                    excluded_columns.append(column)
                    output.append(temp_view)
        elif type(columns) == str:
            if columns not in excluded_columns:
                temp_view = View(ldf.context)
                temp_view.remove_column_from_spec_new(columns, remove_first=True)
                excluded_columns.append(columns)
                output.append(temp_view)
    for i, spec in enumerate(row_specs):
        new_spec = ldf.context.copy()
        new_spec.pop(i)
        temp_view = View(new_spec)
        output.append(temp_view)
    vc = lux.view.ViewCollection.ViewCollection(output)
    vc = vc.load(ldf)
    recommendation["collection"] = vc
    for view in vc:
        view.score = interestingness(view, ldf)
    vc.sort(remove_invalid=True)
    vc.remove_duplicates()
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed generalize action in {toc - tic:0.4f} seconds")
    return recommendation
def enhance(ldf):
    '''
    Given a set of views, generates possible visualizations when an additional attribute is added to the current vis.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Enhance action.
    '''
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    filters = utils.get_filter_specs(ldf.intent)
    # Collect variables that already exist in the intent
    attr_specs = list(filter(lambda x: x.value == "" and x.attribute != "Record", ldf.intent))
    fltr_str = [fltr.attribute + fltr.filter_op + str(fltr.value) for fltr in filters]
    attr_str = [clause.attribute for clause in attr_specs]
    intended_attrs = '<p class="highlight-intent">' + ', '.join(attr_str + fltr_str) + '</p>'
    if len(attr_specs) == 1:
        recommendation = {
            "action": "Enhance",
            "description": f"Augmenting current {intended_attrs} intent with additional attribute.",
        }
    elif len(attr_specs) == 2:
        recommendation = {
            "action": "Enhance",
            "description": f"Further breaking down current {intended_attrs} intent by additional attribute.",
        }
    # if there are too many column attributes, don't generate Enhance recommendations
    elif len(attr_specs) > 2:
        recommendation = {"action": "Enhance"}
        recommendation["collection"] = []
        return recommendation
    intent = ldf.intent.copy()
    intent = filters + attr_specs
    intent.append("?")
    vc = lux.vis.VisList.VisList(intent, ldf)
    # Then use the data populated in the vis list to compute score
    for view in vc:
        view.score = interestingness(view, ldf)
    vc = vc.topK(15)
    recommendation["collection"] = vc
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed enhance action in {toc - tic:0.4f} seconds")
    return recommendation
def filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each categorical value filters the data.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Filter action.
    """
    filters = utils.get_filter_specs(ldf._intent)
    filter_values = []
    output = []
    # if fltr is specified, create visualizations where data is filtered by all values of the fltr's categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_vis[0]._inferred_intent)
    # materialize as a list: a bare map object would be exhausted after the first membership check below
    column_spec_attr = list(map(lambda x: x.attribute, column_spec))
    if len(filters) == 1:
        # get unique values for all categorical values specified and create corresponding filters
        fltr = filters[0]
        if ldf.data_type_lookup[fltr.attribute] == "nominal":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value.",
            }
            unique_values = ldf.unique_values[fltr.attribute]
            filter_values.append(fltr.value)
            # create vis with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_filter = lux.Clause(attribute=fltr.attribute, value=val)
                    new_spec.append(new_filter)
                    temp_vis = Vis(new_spec)
                    output.append(temp_vis)
        elif ldf.data_type_lookup[fltr.attribute] == "quantitative":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative inequality operation.",
            }

            def get_complementary_ops(fltr_op):
                if fltr_op == ">":
                    return "<="
                elif fltr_op == "<":
                    return ">="
                elif fltr_op == ">=":
                    return "<"
                elif fltr_op == "<=":
                    return ">"
                # TODO: need to support case where fltr_op is "=" --> auto-binned ranges

            # Create vis with complementary filter operations
            new_spec = column_spec.copy()
            new_filter = lux.Clause(
                attribute=fltr.attribute,
                filter_op=get_complementary_ops(fltr.filter_op),
                value=fltr.value,
            )
            new_spec.append(new_filter)
            temp_vis = Vis(new_spec, score=1)
            output.append(temp_vis)
    # if no existing filters, create filters using unique values from all categorical variables in the dataset
    else:
        intended_attrs = ", ".join(
            [
                clause.attribute
                for clause in ldf._intent
                if clause.value == "" and clause.attribute != "Record"
            ]
        )
        recommendation = {
            "action": "Filter",
            "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent.",
        }
        categorical_vars = []
        for col in list(ldf.columns):
            # if cardinality is not too high, and attribute is not one of the X,Y (specified) columns
            if ldf.cardinality[col] < 30 and col not in column_spec_attr:
                categorical_vars.append(col)
        for cat in categorical_vars:
            unique_values = ldf.unique_values[cat]
            for i in range(0, len(unique_values)):
                new_spec = column_spec.copy()
                new_filter = lux.Clause(attribute=cat, filter_op="=", value=unique_values[i])
                new_spec.append(new_filter)
                temp_vis = Vis(new_spec)
                output.append(temp_vis)
    vlist = lux.vis.VisList.VisList(output, ldf)
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    vlist = vlist.topK(15)
    recommendation["collection"] = vlist
    return recommendation
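# A minimal standalone check of the complementary-operator mapping defined
# inline above (restated here as a hypothetical helper for illustration; it
# matches test_filter_inequality elsewhere in this collection):
def _complementary_op(fltr_op):
    return {">": "<=", "<": ">=", ">=": "<", "<=": ">"}.get(fltr_op)

assert _complementary_op(">") == "<="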
def correlation(ldf: LuxDataFrame, ignore_transpose: bool = True):
    """
    Generates bivariate visualizations that represent all pairwise relationships in the data.

    Parameters
    ----------
    ldf : LuxDataFrame
        LuxDataFrame with underspecified intent.
    ignore_transpose: bool
        Boolean flag to ignore pairs of attributes whose transpose are already computed (i.e., {X,Y} will be ignored if {Y,X} is already computed)

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Correlation action.
    """
    import numpy as np

    filter_specs = utils.get_filter_specs(ldf._intent)
    intent = [
        lux.Clause("?", data_model="measure"),
        lux.Clause("?", data_model="measure"),
    ]
    intent.extend(filter_specs)
    vlist = VisList(intent, ldf)
    recommendation = {
        "action": "Correlation",
        "description": "Show relationships between two <p class='highlight-descriptor'>quantitative</p> attributes.",
    }
    ignore_rec_flag = False
    # Doesn't make sense to compute correlation with fewer than 5 data points
    if len(ldf) < 5:
        ignore_rec_flag = True
    # Then use the data populated in the vis list to compute score
    for vis in vlist:
        measures = vis.get_attr_by_data_model("measure")
        if len(measures) < 2:
            raise ValueError(
                f"Can not compute correlation between {[x.attribute for x in ldf.columns]} since less than 2 measure values present."
            )
        msr1 = measures[0].attribute
        msr2 = measures[1].attribute
        if ignore_transpose:
            check_transpose = check_transpose_not_computed(vlist, msr1, msr2)
        else:
            check_transpose = True
        if check_transpose:
            vis.score = interestingness(vis, ldf)
        else:
            vis.score = -1
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vlist.sort()
    vlist = vlist.showK()
    recommendation["collection"] = vlist
    return recommendation
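# Hedged sketch of the wildcard intent Correlation builds internally: "?"
# clauses constrained to measures expand into one candidate vis per pair of
# quantitative attributes when handed to VisList with a source dataframe.
import pandas as pd
import lux
from lux.vis.VisList import VisList

df = pd.read_csv("lux/data/car.csv")
intent = [
    lux.Clause("?", data_model="measure"),
    lux.Clause("?", data_model="measure"),
]
vlist = VisList(intent, df)  # e.g., Horsepower x MilesPerGal, Horsepower x Acceleration, ...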
def interestingness(view: View, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the view.
    The interestingness metric is dependent on the view type.

    Parameters
    ----------
    view : View
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score
    """
    if view.data is None:
        raise Exception(
            "View.data needs to be populated before interestingness can be computed. Run Executor.execute(view,ldf)."
        )

    n_dim = 0
    n_msr = 0
    filter_specs = utils.get_filter_specs(view.spec_lst)
    view_attrs_specs = utils.get_attrs_specs(view.spec_lst)

    for spec in view_attrs_specs:
        if spec.attribute != "Record":
            if spec.data_model == "dimension":
                n_dim += 1
            if spec.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [spec for spec in view_attrs_specs if spec.attribute != "Record"]
    dimension_lst = view.get_attr_by_data_model("dimension")
    measure_lst = view.get_attr_by_data_model("measure")

    # Bar Chart
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if n_filter == 0:
            return unevenness(view, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(view, ldf, filter_specs, measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if n_filter == 0:
            v = view.data["Count of Records"]
            return skewness(v)
        elif n_filter == 1:
            return deviation_from_overall(view, ldf, filter_specs, "Count of Records")
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, view.data)
            v_size = len(view.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(view, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        color_attr = view.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        else:
            return -1
    # Scatterplot colored by dimension (unreachable: duplicates the condition of the branch above)
    elif n_dim == 1 and n_msr == 2:
        return 0.2
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # Default
    else:
        return -1
def univariate(ldf, data_type_constraint="quantitative"):
    '''
    Generates bar chart distributions of different attributes in the dataframe.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.
    data_type_constraint: str
        Controls the type of distribution chart that will be rendered.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Distribution action.
    '''
    import numpy as np

    filter_specs = utils.get_filter_specs(ldf._intent)
    ignore_rec_flag = False
    if data_type_constraint == "quantitative":
        intent = [lux.Clause("?", data_type="quantitative", exclude="Number of Records")]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Distribution",
            "description": "Show univariate histograms of <p class='highlight-descriptor'>quantitative</p> attributes.",
        }
        # Doesn't make sense to generate a histogram if there are fewer than 5 data points (pre-aggregated)
        if len(ldf) < 5:
            ignore_rec_flag = True
    elif data_type_constraint == "nominal":
        intent = [lux.Clause("?", data_type="nominal")]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Occurrence",
            "description": "Show frequency of occurrence for <p class='highlight-descriptor'>categorical</p> attributes.",
        }
    elif data_type_constraint == "temporal":
        intent = [lux.Clause("?", data_type="temporal")]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Temporal",
            "description": "Show trends over <p class='highlight-descriptor'>time-related</p> attributes.",
        }
        # Doesn't make sense to generate a line chart if there are fewer than 3 data points (pre-aggregated)
        if len(ldf) < 3:
            ignore_rec_flag = True
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vlist = VisList(intent, ldf)
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    # vlist = vlist.topK(15)  # Basic visualizations should not be capped
    vlist.sort()
    recommendation["collection"] = vlist
    return recommendation
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score
    """
    if vis.data is None or len(vis.data) == 0:
        return -1
        # raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")

    n_dim = 0
    n_msr = 0

    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

    record_attrs = list(
        filter(
            lambda x: x.attribute == "Record" and x.data_model == "measure",
            vis_attrs_specs,
        )
    )
    n_record = len(record_attrs)
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)

    # Line/Bar Chart
    # print("r:", n_record, "m:", n_msr, "d:", n_dim)
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        if n_filter == 0 and "Number of Records" in vis.data:
            v = vis.data["Number of Records"]
            return skewness(v)
        elif n_filter == 1 and "Number of Records" in vis.data:
            return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
        return -1
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 10:
            return -1
        if vis.mark == "heatmap":
            return weighted_correlation(vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"])
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 10:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        else:
            return -1
    # Scatterplot colored by dimension (unreachable: duplicates the condition of the branch above)
    elif n_dim == 1 and n_msr == 2:
        return 0.2
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line and barchart cases
    elif vis.mark == "line" and n_dim == 2:
        return 0.15
    # for colored bar chart, scoring based on Chi-square test for independence score;
    # gives higher scores to colored bar charts with fewer total categories, as these charts are easier to read and thus more useful for users
    elif vis.mark == "bar" and n_dim == 2:
        from scipy.stats import chi2_contingency

        measure_column = vis.get_attr_by_data_model("measure")[0].attribute
        dimension_columns = vis.get_attr_by_data_model("dimension")

        groupby_column = dimension_columns[0].attribute
        color_column = dimension_columns[1].attribute

        contingency_table = []
        groupby_cardinality = ldf.cardinality[groupby_column]
        groupby_unique_vals = ldf.unique_values[groupby_column]
        for c in range(0, groupby_cardinality):
            contingency_table.append(
                vis.data[vis.data[groupby_column] == groupby_unique_vals[c]][measure_column]
            )
        score = 0.12
        # ValueError results if an entire column of the contingency table is 0;
        # this can happen if an applied filter leaves a category with no counts
        try:
            color_cardinality = ldf.cardinality[color_column]
            # scale down score based on number of categories
            chi2_score = chi2_contingency(contingency_table)[0] * 0.9 ** (
                color_cardinality + groupby_cardinality
            )
            score = min(0.10, chi2_score)
        except ValueError:
            pass
        return score
    # Default
    else:
        return -1
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score
    """
    if vis.data is None or len(vis.data) == 0:
        raise Exception(
            "Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf)."
        )

    n_dim = 0
    n_msr = 0

    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

    record_attrs = list(
        filter(
            lambda x: x.attribute == "Record" and x.data_model == "measure",
            vis_attrs_specs,
        )
    )
    n_record = len(record_attrs)
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)

    # Line/Bar Chart
    # print("r:", n_record, "m:", n_msr, "d:", n_dim)
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        if n_filter == 0:
            v = vis.data["Number of Records"]
            return skewness(v)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 2:
            return -1
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 5:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        else:
            return -1
    # Scatterplot colored by dimension (unreachable: duplicates the condition of the branch above)
    elif n_dim == 1 and n_msr == 2:
        return 0.2
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line and barchart cases
    elif (vis.mark == "line" or vis.mark == "bar") and n_dim == 2:
        return 0.2
    # Default
    else:
        return -1
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score
    """
    if vis.data is None or len(vis.data) == 0:
        return -1
        # raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")
    try:
        filter_specs = utils.get_filter_specs(vis._inferred_intent)
        vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
        n_dim = vis._ndim
        n_msr = vis._nmsr
        n_filter = len(filter_specs)
        attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
        dimension_lst = vis.get_attr_by_data_model("dimension")
        measure_lst = vis.get_attr_by_data_model("measure")
        v_size = len(vis.data)

        if (
            n_dim == 1
            and (n_msr == 0 or n_msr == 1)
            and ldf.current_vis is not None
            and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
            and len(ldf.current_vis) == 1
            and ldf.current_vis[0].mark == "line"
            and len(get_filter_specs(ldf.intent)) > 0
        ):
            query_vc = VisList(ldf.current_vis, ldf)
            query_vis = query_vc[0]
            preprocess(query_vis)
            preprocess(vis)
            return 1 - euclidean_dist(query_vis, vis)

        # Line/Bar Chart
        # print("r:", n_record, "m:", n_msr, "d:", n_dim)
        if n_dim == 1 and (n_msr == 0 or n_msr == 1):
            if v_size < 2:
                return -1
            if n_filter == 0:
                return unevenness(vis, ldf, measure_lst, dimension_lst)
            elif n_filter == 1:
                return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
        # Histogram
        elif n_dim == 0 and n_msr == 1:
            if v_size < 2:
                return -1
            if n_filter == 0 and "Number of Records" in vis.data:
                v = vis.data["Number of Records"]
                return skewness(v)
            elif n_filter == 1 and "Number of Records" in vis.data:
                return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
            return -1
        # Scatter Plot
        elif n_dim == 0 and n_msr == 2:
            if v_size < 10:
                return -1
            if vis.mark == "heatmap":
                return weighted_correlation(
                    vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"]
                )
            if n_filter == 1:
                v_filter_size = get_filtered_size(filter_specs, vis.data)
                sig = v_filter_size / v_size
            else:
                sig = 1
            return sig * monotonicity(vis, attr_specs)
        # Scatterplot colored by Dimension
        elif n_dim == 1 and n_msr == 2:
            if v_size < 10:
                return -1
            color_attr = vis.get_attr_by_channel("color")[0].attribute
            C = ldf.cardinality[color_attr]
            if C < 40:
                return 1 / C
            else:
                return -1
        # Scatterplot colored by dimension (unreachable: duplicates the condition of the branch above)
        elif n_dim == 1 and n_msr == 2:
            return 0.2
        # Scatterplot colored by measure
        elif n_msr == 3:
            return 0.1
        # colored line and barchart cases
        elif vis.mark == "line" and n_dim == 2:
            return 0.15
        # for colored bar chart, scoring based on Chi-square test for independence score;
        # gives higher scores to colored bar charts with fewer total categories, as these charts are easier to read and thus more useful for users
        elif vis.mark == "bar" and n_dim == 2:
            from scipy.stats import chi2_contingency

            measure_column = vis.get_attr_by_data_model("measure")[0].attribute
            dimension_columns = vis.get_attr_by_data_model("dimension")

            groupby_column = dimension_columns[0].attribute
            color_column = dimension_columns[1].attribute

            contingency_tbl = pd.crosstab(
                vis.data[groupby_column],
                vis.data[color_column],
                values=vis.data[measure_column],
                aggfunc=sum,
            )
            try:
                color_cardinality = ldf.cardinality[color_column]
                groupby_cardinality = ldf.cardinality[groupby_column]
                # scale down score based on number of categories
                chi2_score = chi2_contingency(contingency_tbl)[0] * 0.9 ** (
                    color_cardinality + groupby_cardinality
                )
                score = min(0.10, chi2_score)
            except (ValueError, KeyError):
                # ValueError results if an entire column of the contingency table is 0;
                # this can happen if an applied filter leaves a category with no counts
                score = -1
            return score
        # Default
        else:
            return -1
    except:
        if lux.config.interestingness_fallback:
            # Suppress interestingness-related issues
            warnings.warn(f"An error occurred when computing interestingness for: {vis}")
            return -1
        else:
            raise
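# A minimal, self-contained sketch of the chi-square scoring idea above, with
# ldf.cardinality replaced by the contingency table's shape (an illustrative
# simplification, not lux's exact code path):
import pandas as pd
from scipy.stats import chi2_contingency

groupby = ["A", "A", "B", "B"]
color = ["x", "y", "x", "y"]
counts = [5, 1, 2, 6]
# 2x2 contingency table of the measure, grouped by category and color
tbl = pd.crosstab(index=groupby, columns=color, values=counts, aggfunc="sum")
chi2 = chi2_contingency(tbl)[0]
# scale down by total number of categories, then cap at 0.10
score = min(0.10, chi2 * 0.9 ** (tbl.shape[0] + tbl.shape[1]))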
def univariate(ldf, *args):
    """
    Generates bar chart distributions of different attributes in the dataframe.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.
    data_type_constraint: str
        Controls the type of distribution chart that will be rendered.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Distribution action.
    """
    import numpy as np

    if len(args) == 0:
        data_type_constraint = "quantitative"
    else:
        data_type_constraint = args[0][0]

    filter_specs = utils.get_filter_specs(ldf._intent)
    ignore_rec_flag = False
    if data_type_constraint == "quantitative":
        possible_attributes = [
            c
            for c in ldf.columns
            if ldf.data_type[c] == "quantitative"
            and ldf.cardinality[c] > 5
            and c != "Number of Records"
        ]
        intent = [lux.Clause(possible_attributes)]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Distribution",
            "description": "Show univariate histograms of <p class='highlight-descriptor'>quantitative</p> attributes.",
        }
        # Doesn't make sense to generate a histogram if there are fewer than 5 data points (pre-aggregated)
        if len(ldf) < 5:
            ignore_rec_flag = True
    elif data_type_constraint == "nominal":
        intent = [lux.Clause("?", data_type="nominal")]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Occurrence",
            "description": "Show frequency of occurrence for <p class='highlight-descriptor'>categorical</p> attributes.",
        }
    elif data_type_constraint == "temporal":
        intent = [lux.Clause("?", data_type="temporal")]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Temporal",
            "description": "Show trends over <p class='highlight-descriptor'>time-related</p> attributes.",
        }
        # Doesn't make sense to generate a line chart if there are fewer than 3 data points (pre-aggregated)
        if len(ldf) < 3:
            ignore_rec_flag = True
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vlist = VisList(intent, ldf)
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    vlist.sort()
    recommendation["collection"] = vlist
    return recommendation
def generalize(ldf):
    """
    Generates all possible visualizations when one attribute or filter from the current vis is removed.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Generalize action.
    """
    # takes in a dataObject and generates a list of new dataObjects, each with a single measure from the original object removed
    # --> return list of dataObjects with corresponding interestingness scores
    output = []
    excluded_columns = []
    attributes = list(filter(lambda x: x.value == "" and x.attribute != "Record", ldf._intent))
    filters = utils.get_filter_specs(ldf._intent)

    fltr_str = [fltr.attribute + fltr.filter_op + str(fltr.value) for fltr in filters]
    attr_str = [clause.attribute for clause in attributes]
    intended_attrs = '<p class="highlight-intent">' + ", ".join(attr_str + fltr_str) + "</p>"

    recommendation = {
        "action": "Generalize",
        "description": f"Remove an attribute or filter from {intended_attrs}.",
    }  # to observe a more general trend
    # if we do not have enough column attributes, or have too many, return no vis.
    if len(attributes) < 1 or len(attributes) > 4:
        recommendation["collection"] = []
        return recommendation

    # for each column specification, create a copy of the ldf's vis and remove the column specification,
    # then append the vis to the output
    if len(attributes) > 1:
        for clause in attributes:
            columns = clause.attribute
            if type(columns) == list:
                for column in columns:
                    if column not in excluded_columns:
                        temp_vis = Vis(ldf.copy_intent(), score=1)
                        temp_vis.remove_column_from_spec(column, remove_first=True)
                        excluded_columns.append(column)
                        output.append(temp_vis)
            elif type(columns) == str:
                if columns not in excluded_columns:
                    temp_vis = Vis(ldf.copy_intent(), score=1)
                    temp_vis.remove_column_from_spec(columns, remove_first=True)
                    excluded_columns.append(columns)
                    output.append(temp_vis)
    # for each filter specification, create a copy of the ldf's current vis and remove the filter specification,
    # then append the vis to the output
    for clause in filters:
        # new_spec = ldf._intent.copy()
        # new_spec.remove_column_from_spec(new_spec.attribute)
        temp_vis = Vis(
            ldf.current_vis[0]._inferred_intent.copy(),
            source=ldf,
            title="Overall",
            score=0,
        )
        temp_vis.remove_filter_from_spec(clause.value)
        output.append(temp_vis)

    vlist = lux.vis.VisList.VisList(output, source=ldf)
    # Ignore interestingness sorting since Generalize yields very few vis
    # (preserve order: removed attributes first, then removed filters)
    # for vis in vlist:
    #     vis.score = interestingness(vis, ldf)
    vlist.remove_duplicates()
    vlist.sort(remove_invalid=True)
    recommendation["collection"] = vlist
    return recommendation
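# Worked example, derived from the logic above: for the intent
#   [Horsepower, MilesPerGal, Origin = 'USA']
# Generalize yields one candidate vis per removed element:
#   [MilesPerGal, Origin = 'USA']   (Horsepower removed)
#   [Horsepower, Origin = 'USA']    (MilesPerGal removed)
#   [Horsepower, MilesPerGal]       ("Overall" vis with the filter removed)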
def filter(ldf):
    '''
    Iterates over all possible values of a categorical variable and generates visualizations where each categorical value filters the data.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Filter action.
    '''
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    recommendation = {
        "action": "Filter",
        "description": "Shows possible visualizations when filtered by categorical variables in the dataset.",
    }
    filters = utils.get_filter_specs(ldf.context)
    filter_values = []
    output = []
    # if Row is specified, create visualizations where data is filtered by all values of the Row's categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_view[0].spec_lst)
    # materialize as a list: a bare map object would be exhausted after the first membership check below
    column_spec_attr = list(map(lambda x: x.attribute, column_spec))
    if len(filters) > 0:
        # get unique values for all categorical values specified and create corresponding filters
        for row in filters:
            unique_values = ldf.unique_values[row.attribute]
            filter_values.append(row.value)
            # create views with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_filter = lux.Spec(attribute=row.attribute, value=val)
                    new_spec.append(new_filter)
                    temp_view = View(new_spec)
                    output.append(temp_view)
    else:
        # if no existing filters, create filters using unique values from all categorical variables in the dataset
        categorical_vars = []
        for col in list(ldf.columns):
            # if cardinality is not too high, and attribute is not one of the X,Y (specified) columns
            if ldf.cardinality[col] < 40 and col not in column_spec_attr:
                categorical_vars.append(col)
        for cat in categorical_vars:
            unique_values = ldf.unique_values[cat]
            for i in range(0, len(unique_values)):
                new_spec = column_spec.copy()
                new_filter = lux.Spec(attribute=cat, filter_op="=", value=unique_values[i])
                new_spec.append(new_filter)
                temp_view = View(new_spec)
                output.append(temp_view)
    vc = lux.view.ViewCollection.ViewCollection(output)
    vc = vc.load(ldf)
    for view in vc:
        view.score = interestingness(view, ldf)
    vc = vc.topK(15)
    recommendation["collection"] = vc
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed filter action in {toc - tic:0.4f} seconds")
    return recommendation
def add_filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each categorical value filters the data.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Filter action.
    """
    filters = utils.get_filter_specs(ldf._intent)
    filter_values = []
    output = []
    # if fltr is specified, create visualizations where data is filtered by all values of the fltr's categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_vis[0].intent)
    column_spec_attr = list(map(lambda x: x.attribute, column_spec))
    if len(filters) == 1:
        # get unique values for all categorical values specified and create corresponding filters
        fltr = filters[0]
        if ldf.data_type[fltr.attribute] == "nominal":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                f"alternative value.",
                "long_description": f"Swap out the filter value for {fltr.attribute} to other possible values, while "
                f"keeping all else the same. Visualizations are ranked based on interestingness",
            }
            unique_values = ldf.unique_values[fltr.attribute]
            filter_values.append(fltr.value)
            # create vis with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_filter = lux.Clause(attribute=fltr.attribute, value=val)
                    new_spec.append(new_filter)
                    temp_vis = Vis(new_spec)
                    output.append(temp_vis)
        elif ldf.data_type[fltr.attribute] == "quantitative":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                f"alternative inequality operation.",
                "long_description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                f"alternative inequality operation.",
            }
            # Create vis with complementary filter operations
            # NOTE: This section of code has been modified to allow for the rendering of multiple vis
            for op in get_complementary_ops(fltr.filter_op):
                new_spec = column_spec.copy()
                new_filter = lux.Clause(
                    attribute=fltr.attribute,
                    filter_op=op,
                    value=fltr.value,
                )
                new_spec.append(new_filter)
                temp_vis = Vis(new_spec, score=1)
                output.append(temp_vis)
    # if no existing filters, create filters using unique values from all categorical variables in the dataset
    else:
        intended_attrs = ", ".join(
            [
                str(clause.attribute)
                for clause in ldf._intent
                if clause.value == "" and clause.attribute != "Record"
            ]
        )
        recommendation = {
            "action": "Filter",
            "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent.",
            "long_description": f"Adding any filter while keeping the attributes on the x and y axes fixed. "
            f"Visualizations are ranked based on interestingness",
        }
        categorical_vars = []
        for col in list(ldf.columns):
            # if cardinality is not too high, and attribute is not one of the X,Y (specified) columns
            if 1 < ldf.cardinality[col] < 30 and col not in column_spec_attr:
                categorical_vars.append(col)
        for cat in categorical_vars:
            unique_values = ldf.unique_values[cat]
            for val in unique_values:
                new_spec = column_spec.copy()
                new_filter = lux.Clause(attribute=cat, filter_op="=", value=val)
                new_spec.append(new_filter)
                temp_vis = Vis(new_spec)
                output.append(temp_vis)
    if (
        ldf.current_vis is not None
        and len(ldf.current_vis) == 1
        and ldf.current_vis[0].mark == "line"
        and len(get_filter_specs(ldf.intent)) > 0
    ):
        recommendation = {
            "action": "Similarity",
            "description": "Show other charts that are visually similar to the Current vis.",
            "long_description": "Show other charts that are visually similar to the Current vis.",
        }
        last = get_filter_specs(ldf.intent)[-1]
        output = ldf.intent.copy()[0:-1]
        # array of possible values for attribute
        arr = ldf[last.attribute].unique().tolist()
        output.append(lux.Clause(last.attribute, last.attribute, arr))
    vlist = lux.vis.VisList.VisList(output, ldf)
    vlist_copy = lux.vis.VisList.VisList(output, ldf)
    for i in range(len(vlist_copy)):
        vlist[i].score = interestingness(vlist_copy[i], ldf)
    vlist.sort()
    vlist = vlist.showK()
    if recommendation["action"] == "Similarity":
        recommendation["collection"] = vlist[1:]
    else:
        recommendation["collection"] = vlist
    return recommendation
def correlation(ldf: LuxDataFrame, ignore_transpose: bool = True):
    '''
    Generates bivariate visualizations that represent all pairwise relationships in the data.

    Parameters
    ----------
    ldf : LuxDataFrame
        LuxDataFrame with underspecified intent.
    ignore_transpose: bool
        Boolean flag to ignore pairs of attributes whose transpose are already computed (i.e., {X,Y} will be ignored if {Y,X} is already computed)

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Correlation action.
    '''
    import numpy as np

    # for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    filter_specs = utils.get_filter_specs(ldf.intent)
    intent = [
        lux.Clause("?", data_model="measure"),
        lux.Clause("?", data_model="measure"),
    ]
    intent.extend(filter_specs)
    vc = VisList(intent, ldf)
    recommendation = {
        "action": "Correlation",
        "description": "Show relationships between two <p class='highlight-descriptor'>quantitative</p> attributes.",
    }
    ignore_rec_flag = False
    # Doesn't make sense to compute correlation with fewer than 5 data points
    if len(ldf) < 5:
        ignore_rec_flag = True
    # Then use the data populated in the vis list to compute score
    for view in vc:
        measures = view.get_attr_by_data_model("measure")
        if len(measures) < 2:
            raise ValueError(
                f"Can not compute correlation between {[x.attribute for x in ldf.columns]} since less than 2 measure values present."
            )
        msr1 = measures[0].attribute
        msr2 = measures[1].attribute
        if ignore_transpose:
            check_transpose = check_transpose_not_computed(vc, msr1, msr2)
        else:
            check_transpose = True
        if check_transpose:
            view.score = interestingness(view, ldf)
        else:
            view.score = -1
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vc = vc.topK(15)
    recommendation["collection"] = vc
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed correlation action in {toc - tic:0.4f} seconds")
    return recommendation
def univariate(ldf, *args):
    """
    Generates bar chart distributions of different attributes in the dataframe.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.
    data_type_constraint: str
        Controls the type of distribution chart that will be rendered.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Distribution action.
    """
    import numpy as np

    if len(args) == 0:
        data_type_constraint = "quantitative"
    else:
        data_type_constraint = args[0][0]

    filter_specs = utils.get_filter_specs(ldf._intent)
    ignore_rec_flag = False
    if data_type_constraint == "quantitative":
        possible_attributes = [
            c
            for c in ldf.columns
            if ldf.data_type[c] == "quantitative"
            and ldf.cardinality[c] > 5
            and c != "Number of Records"
        ]
        intent = [lux.Clause(possible_attributes)]
        intent.extend(filter_specs)
        examples = ""
        if len(possible_attributes) >= 1:
            examples = f" (e.g., {possible_attributes[0]})"
        recommendation = {
            "action": "Distribution",
            "description": "Show univariate histograms of <p class='highlight-descriptor'>quantitative</p> attributes.",
            "long_description": f"Distribution displays univariate histogram distributions of all quantitative attributes{examples}. Visualizations are ranked from most to least skewed.",
        }
        # Doesn't make sense to generate a histogram if there are fewer than 5 data points (pre-aggregated)
        if ldf.length < 5:
            ignore_rec_flag = True
    elif data_type_constraint == "nominal":
        possible_attributes = [
            c
            for c in ldf.columns
            if ldf.data_type[c] == "nominal"
            and ldf.cardinality[c] > 5
            and c != "Number of Records"
        ]
        examples = ""
        if len(possible_attributes) >= 1:
            examples = f" (e.g., {possible_attributes[0]})"
        intent = [lux.Clause("?", data_type="nominal")]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Occurrence",
            "description": "Show frequency of occurrence for <p class='highlight-descriptor'>categorical</p> attributes.",
            "long_description": f"Occurrence displays bar charts of counts for all categorical attributes{examples}. Visualizations are ranked from most to least uneven across the bars.",
        }
    elif data_type_constraint == "geographical":
        possible_attributes = [
            c
            for c in ldf.columns
            if ldf.data_type[c] == "geographical"
            and ldf.cardinality[c] > 5
            and c != "Number of Records"
        ]
        examples = ""
        if len(possible_attributes) >= 1:
            examples = f" (e.g., {possible_attributes[0]})"
        intent = [
            lux.Clause("?", data_type="geographical"),
            lux.Clause("?", data_model="measure"),
        ]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Geographical",
            "description": "Show choropleth maps of <p class='highlight-descriptor'>geographic</p> attributes",
            "long_description": f"Geographical displays choropleths of averages for some geographic attribute{examples}. Visualizations are ranked by diversity of the geographic attribute.",
        }
    elif data_type_constraint == "temporal":
        intent = [lux.Clause("?", data_type="temporal")]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Temporal",
            "description": "Show trends over <p class='highlight-descriptor'>time-related</p> attributes.",
            "long_description": "Temporal displays line charts for all attributes related to datetimes in the dataframe.",
        }
        # Doesn't make sense to generate a line chart if there are fewer than 3 data points (pre-aggregated)
        if ldf.length < 3:
            ignore_rec_flag = True
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vlist = VisList(intent, ldf)
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    vlist.sort()
    recommendation["collection"] = vlist
    return recommendation
def determine_encoding(ldf: LuxDataFrame, vis: Vis):
    """
    Populates Vis with the appropriate mark type and channel information based on ShowMe logic.
    Currently supports up to 3 dimensions or measures.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent
    vis : lux.vis.Vis

    Returns
    -------
    None

    Notes
    -----
    Implementing automatic encoding from Tableau's VizQL
    Mackinlay, J. D., Hanrahan, P., & Stolte, C. (2007).
    Show Me: Automatic presentation for visual analysis.
    IEEE Transactions on Visualization and Computer Graphics, 13(6), 1137-1144.
    https://doi.org/10.1109/TVCG.2007.70594
    """
    # Count number of measures and dimensions
    ndim = vis._ndim
    nmsr = vis._nmsr
    # preserve to add back to _inferred_intent later
    filters = utils.get_filter_specs(vis._inferred_intent)

    # Helper function (TODO: Move this into utils)
    def line_or_bar(ldf, dimension: Clause, measure: Clause):
        dim_type = dimension.data_type
        # If no aggregation function is specified, then default to average
        if measure.aggregation == "":
            measure.set_aggregation("mean")
        if dim_type == "temporal" or dim_type == "ordinal":
            return "line", {"x": dimension, "y": measure}
        else:  # unordered categorical
            # if cardinality is larger than 5, sort the bars
            if ldf.cardinality[dimension.attribute] > 5:
                dimension.sort = "ascending"
            return "bar", {"x": measure, "y": dimension}

    # ShowMe logic + additional heuristics
    # count_col = Clause(attribute="count()", data_model="measure")
    count_col = Clause(
        attribute="Record",
        aggregation="count",
        data_model="measure",
        data_type="quantitative",
    )
    auto_channel = {}
    if ndim == 0 and nmsr == 1:
        # Histogram with Count
        measure = vis.get_attr_by_data_model("measure", exclude_record=True)[0]
        # append a count measure only if Record is not already present
        if len(vis.get_attr_by_attr_name("Record")) == 0:
            vis._inferred_intent.append(count_col)
        # If no bin size is specified, then default to 10
        if measure.bin_size == 0:
            measure.bin_size = 10
        auto_channel = {"x": measure, "y": count_col}
        vis._mark = "histogram"
    elif ndim == 1 and (nmsr == 0 or nmsr == 1):
        # Line or Bar Chart
        if nmsr == 0:
            vis._inferred_intent.append(count_col)
        dimension = vis.get_attr_by_data_model("dimension")[0]
        measure = vis.get_attr_by_data_model("measure")[0]
        vis._mark, auto_channel = line_or_bar(ldf, dimension, measure)
    elif ndim == 2 and (nmsr == 0 or nmsr == 1):
        # Line or Bar chart broken down by the dimension
        dimensions = vis.get_attr_by_data_model("dimension")
        d1 = dimensions[0]
        d2 = dimensions[1]
        if ldf.cardinality[d1.attribute] < ldf.cardinality[d2.attribute]:
            # d1.channel = "color"
            vis.remove_column_from_spec(d1.attribute)
            dimension = d2
            color_attr = d1
        else:
            # if same attribute then remove_column_from_spec will remove both dims, we only want to remove one
            if d1.attribute == d2.attribute:
                vis._inferred_intent.pop(0)
            else:
                vis.remove_column_from_spec(d2.attribute)
            dimension = d1
            color_attr = d2
        # Colored Bar/Line chart with Count as default measure
        if not ldf.pre_aggregated:
            if nmsr == 0 and not ldf.pre_aggregated:
                vis._inferred_intent.append(count_col)
        measure = vis.get_attr_by_data_model("measure")[0]
        vis._mark, auto_channel = line_or_bar(ldf, dimension, measure)
        auto_channel["color"] = color_attr
    elif ndim == 0 and nmsr == 2:
        # Scatterplot
        vis._mark = "scatter"
        vis._inferred_intent[0].set_aggregation(None)
        vis._inferred_intent[1].set_aggregation(None)
        auto_channel = {"x": vis._inferred_intent[0], "y": vis._inferred_intent[1]}
    elif ndim == 1 and nmsr == 2:
        # Scatterplot broken down by the dimension
        measure = vis.get_attr_by_data_model("measure")
        m1 = measure[0]
        m2 = measure[1]

        vis._inferred_intent[0].set_aggregation(None)
        vis._inferred_intent[1].set_aggregation(None)

        color_attr = vis.get_attr_by_data_model("dimension")[0]
        vis.remove_column_from_spec(color_attr)
        vis._mark = "scatter"
        auto_channel = {"x": m1, "y": m2, "color": color_attr}
    elif ndim == 0 and nmsr == 3:
        # Scatterplot with color
        vis._mark = "scatter"
        auto_channel = {
            "x": vis._inferred_intent[0],
            "y": vis._inferred_intent[1],
            "color": vis._inferred_intent[2],
        }
    relevant_attributes = [auto_channel[channel].attribute for channel in auto_channel]
    relevant_min_max = dict(
        (attr, ldf._min_max[attr])
        for attr in relevant_attributes
        if attr != "Record" and attr in ldf._min_max
    )
    vis._min_max = relevant_min_max
    if auto_channel != {}:
        vis = Compiler.enforce_specified_channel(vis, auto_channel)
        vis._inferred_intent.extend(filters)  # add back the preserved filters
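# Hedged usage sketch of the ShowMe logic above: mark types are inferred
# automatically when a Vis is compiled against a dataframe. The expected marks
# noted below follow the dimension/measure cases in determine_encoding and
# assume lux's default type inference on the car dataset.
import pandas as pd
import lux
from lux.vis.Vis import Vis

df = pd.read_csv("lux/data/car.csv")
Vis(["MilesPerGal"], df).mark                 # 0 dims, 1 measure -> "histogram"
Vis(["Origin", "MilesPerGal"], df).mark       # 1 dim, 1 measure  -> "bar"
Vis(["Horsepower", "MilesPerGal"], df).mark   # 0 dims, 2 measures -> "scatter"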
def filter(ldf):
    '''
    Iterates over all possible values of a categorical variable and generates visualizations where each categorical value filters the data.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Filter action.
    '''
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    filters = utils.get_filter_specs(ldf.intent)
    filter_values = []
    output = []
    # if fltr is specified, create visualizations where data is filtered by all values of the fltr's categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_vis[0]._inferred_intent)
    # materialize as a list: a bare map object would be exhausted after the first membership check below
    column_spec_attr = list(map(lambda x: x.attribute, column_spec))
    if len(filters) == 1:
        # get unique values for all categorical values specified and create corresponding filters
        fltr = filters[0]
        unique_values = ldf.unique_values[fltr.attribute]
        filter_values.append(fltr.value)
        # create views with new filters
        for val in unique_values:
            if val not in filter_values:
                new_spec = column_spec.copy()
                new_filter = lux.Clause(attribute=fltr.attribute, value=val)
                new_spec.append(new_filter)
                temp_view = Vis(new_spec)
                output.append(temp_view)
        recommendation = {
            "action": "Filter",
            "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value.",
        }
    else:
        # if no existing filters, create filters using unique values from all categorical variables in the dataset
        intended_attrs = (
            "<b>"
            + ", ".join(
                [
                    clause.attribute
                    for clause in ldf.intent
                    if clause.value == "" and clause.attribute != "Record"
                ]
            )
            + "</b>"
        )
        recommendation = {
            "action": "Filter",
            "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent.",
        }
        categorical_vars = []
        for col in list(ldf.columns):
            # if cardinality is not too high, and attribute is not one of the X,Y (specified) columns
            if ldf.cardinality[col] < 30 and col not in column_spec_attr:
                categorical_vars.append(col)
        for cat in categorical_vars:
            unique_values = ldf.unique_values[cat]
            for i in range(0, len(unique_values)):
                new_spec = column_spec.copy()
                new_filter = lux.Clause(attribute=cat, filter_op="=", value=unique_values[i])
                new_spec.append(new_filter)
                temp_view = Vis(new_spec)
                output.append(temp_view)
    vc = lux.vis.VisList.VisList(output, ldf)
    for view in vc:
        view.score = interestingness(view, ldf)
    vc = vc.topK(15)
    recommendation["collection"] = vc
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed filter action in {toc - tic:0.4f} seconds")
    return recommendation