def execute_filter(view: Vis):
    """
    Helper function to convert a Vis' filter specification to a SQL where clause.
    Takes in a Vis object and returns an appropriate SQL WHERE clause based on the
    filters specified in the vis' _inferred_intent.

    Every attribute of the vis other than "Record" additionally gets an
    ``IS NOT NULL`` condition, because null values break binning queries.

    Parameters
    ----------
    view: lux.Vis
        lux.Vis object that represents a visualization

    Returns
    -------
    where_clause: string
        String representation of a SQL WHERE clause ("" when there is nothing to filter on)
    filter_vars: list of strings
        list of variables that have been used as filters
    """

    def quote_identifier(name):
        # Double embedded double quotes so a column name cannot terminate the
        # quoted identifier early (values get the same treatment below).
        return '"' + str(name).replace('"', '""') + '"'

    conditions = []
    filter_vars = []
    for spec in utils.get_filter_specs(view._inferred_intent) or []:
        # Escape single quotes inside the literal by doubling them.
        literal = str(spec.value).replace("'", "''")
        conditions.append(
            " ".join([quote_identifier(spec.attribute), str(spec.filter_op), "'" + literal + "'"])
        )
        if spec.attribute not in filter_vars:
            filter_vars.append(spec.attribute)

    # need to ensure that no null values are included in the data
    # null values breaks binning queries
    for clause in utils.get_attrs_specs(view._inferred_intent):
        if clause.attribute != "Record":
            conditions.append(quote_identifier(clause.attribute) + " IS NOT NULL")

    if not conditions:
        return ("", [])
    return ("WHERE " + " AND ".join(conditions), filter_vars)
def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> float:
    """
    Difference in bar chart/histogram shape from overall chart
    Note: this function assumes that the filtered vis.data is operating on the same range as the unfiltered vis.data.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame
    filter_specs : list
        List of filters from the Vis
    msr_attribute : str
        The attribute name of the measure value of the chart

    Returns
    -------
    float
        Score describing how different the vis is from the overall vis.
        0 is returned when the filtered measure column sums to zero.
    """
    import copy

    v_filter_size = get_filtered_size(filter_specs, ldf)
    v_size = len(vis.data)
    v_filter = vis.data[msr_attribute]
    total = v_filter.sum()
    # Guard BEFORE normalizing: dividing by a zero total would only produce NaN/inf.
    if total == 0:
        return 0
    v_filter = v_filter / total  # normalize by total to get ratio

    # Generate an "Overall" Vis (TODO: This is computed multiple times for every vis, alternative is to
    # directly access df.current_vis but we do not have guarantee that will always be unfiltered vis
    # (in the non-Filter action scenario))
    unfiltered_vis = copy.copy(vis)
    # Remove filters, keep only attribute intent
    unfiltered_vis._inferred_intent = utils.get_attrs_specs(vis._inferred_intent)
    ldf.executor.execute([unfiltered_vis], ldf)

    v = unfiltered_vis.data[msr_attribute]
    v = v / v.sum()
    assert len(v) == len(v_filter), "Data for filtered and unfiltered vis have unequal length."
    sig = v_filter_size / v_size  # significance factor

    rankSig = 1  # category measure value ranking significance factor
    # if the vis is a barchart, count how many categories' rank, based on measure value, changes after
    # the filter is applied
    if vis.mark == "bar":
        dimList = vis.get_attr_by_data_model("dimension")
        # use Pandas rank function to calculate rank positions for each category
        v_rank = unfiltered_vis.data.rank()
        v_filter_rank = vis.data.rank()
        # go through and count the number of ranking changes between the filtered and unfiltered data
        numCategories = ldf.cardinality[dimList[0].attribute]
        # NOTE(review): the last category (index numCategories - 1) is never compared; together with
        # rankSig starting at 1 this caps the normalized factor at 1 -- confirm this is intentional.
        for r in range(numCategories - 1):
            if v_rank[msr_attribute][r] != v_filter_rank[msr_attribute][r]:
                rankSig += 1
        # normalize ranking significance factor
        rankSig = rankSig / numCategories

    # Euclidean distance as L2 function
    from scipy.spatial.distance import euclidean

    return sig * rankSig * euclidean(v, v_filter)
def intent_to_JSON(intent):
    """
    Summarize an intent as a plain dict of attribute names.

    Parameters
    ----------
    intent : list
        Clauses to summarize; split into attribute clauses and filter clauses
        by lux.utils.utils.

    Returns
    -------
    dict
        {"attributes": [...], "filters": [...]} where each list holds the
        `attribute` of the corresponding clauses.
    """
    from lux.utils import utils

    def attribute_names(clauses):
        return [clause.attribute for clause in clauses]

    filter_clauses = utils.get_filter_specs(intent)
    attribute_clauses = utils.get_attrs_specs(intent)
    return {
        "attributes": attribute_names(attribute_clauses),
        "filters": attribute_names(filter_clauses),
    }
def context_to_JSON(context):
    """
    Summarize a context as a plain dict of attribute names.

    Parameters
    ----------
    context : list
        Specs to summarize; split into attribute specs and filter specs
        by lux.utils.utils.

    Returns
    -------
    dict
        {"attributes": [...], "filters": [...]} where each list holds the
        `attribute` of the corresponding specs.
    """
    from lux.utils import utils

    def attribute_names(spec_list):
        return [spec.attribute for spec in spec_list]

    filters_only = utils.get_filter_specs(context)
    attrs_only = utils.get_attrs_specs(context)
    return {
        "attributes": attribute_names(attrs_only),
        "filters": attribute_names(filters_only),
    }
def create_where_clause(filter_specs, view=""):
    """
    Build a SQL WHERE clause from a list of filter specifications.

    Parameters
    ----------
    filter_specs : list
        Filter clauses; each must expose `attribute`, `filter_op` and `value`.
        An empty list or None contributes no conditions.
    view : lux.Vis, optional
        When given (anything other than the default ""), every attribute of
        the vis' _inferred_intent except "Record" additionally gets an
        ``IS NOT NULL`` condition, because null values break binning queries.

    Returns
    -------
    where_clause : string
        String representation of a SQL WHERE clause ("" when there are no conditions)
    filter_vars : list
        Attributes used as filters, in first-seen order without duplicates
    """

    def quote_identifier(name):
        # Double embedded double quotes so a column name cannot terminate the
        # quoted identifier early (values get the same treatment below).
        return '"' + str(name).replace('"', '""') + '"'

    conditions = []
    filter_vars = []
    for spec in filter_specs or []:
        # Escape single quotes inside the literal by doubling them.
        literal = str(spec.value).replace("'", "''")
        conditions.append(
            " ".join([quote_identifier(spec.attribute), str(spec.filter_op), "'" + literal + "'"])
        )
        if spec.attribute not in filter_vars:
            filter_vars.append(spec.attribute)

    if view != "":
        # need to ensure that no null values are included in the data
        # null values breaks binning queries
        for clause in utils.get_attrs_specs(view._inferred_intent):
            if clause.attribute != "Record":
                conditions.append(quote_identifier(clause.attribute) + " IS NOT NULL")

    if not conditions:
        return ("", [])
    return ("WHERE " + " AND ".join(conditions), filter_vars)
def deviation_from_overall(view: View, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> float:
    """
    Difference in bar chart/histogram shape from overall chart
    Note: this function assumes that the filtered view.data is operating on the same range as the unfiltered view.data.

    Parameters
    ----------
    view : View
    ldf : LuxDataFrame
    filter_specs : list
        List of filters from the View
    msr_attribute : str
        The attribute name of the measure value of the chart

    Returns
    -------
    float
        Score describing how different the view is from the overall view.
        0 is returned when the filtered measure column sums to zero.
    """
    import copy

    v_filter_size = get_filtered_size(filter_specs, ldf)
    v_size = len(view.data)
    v_filter = view.data[msr_attribute]
    total = v_filter.sum()
    # A zero total cannot be normalized (it would yield NaN scores that poison
    # the ranking), so treat the view as not deviating at all.
    if total == 0:
        return 0
    v_filter = v_filter / total  # normalize by total to get ratio

    # Generate an "Overall" View (TODO: This is computed multiple times for every view, alternative is to
    # directly access df.current_view but we do not have guarantee that will always be unfiltered view
    # (in the non-Filter action scenario))
    unfiltered_view = copy.copy(view)
    # Remove filters, keep only attribute specs
    unfiltered_view.spec_lst = utils.get_attrs_specs(view.spec_lst)
    ldf.executor.execute([unfiltered_view], ldf)

    v = unfiltered_view.data[msr_attribute]
    v = v / v.sum()
    assert len(v) == len(v_filter), "Data for filtered and unfiltered view have unequal length."
    sig = v_filter_size / v_size  # significance factor

    # Euclidean distance as L2 function
    from scipy.spatial.distance import euclidean

    return sig * euclidean(v, v_filter)
def filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each
    categorical value filters the data.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Filter action.
    """
    filters = utils.get_filter_specs(ldf._intent)
    filter_values = []
    output = []
    # Fallback so `recommendation` is always bound, e.g. when the single existing filter is on an
    # attribute that is neither nominal nor quantitative (no alternative filters are generated then).
    recommendation = {
        "action": "Filter",
        "description": "Shows possible visualizations when filtered by categorical variables in the dataset.",
    }
    # if fltr is specified, create visualizations where data is filtered by all values of the fltr's
    # categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_vis[0]._inferred_intent)
    # Materialized as a list: a bare map() iterator would be exhausted by the first `in` test below.
    column_spec_attr = [clause.attribute for clause in column_spec]
    if len(filters) == 1:
        # get unique values for all categorical values specified and creates corresponding filters
        fltr = filters[0]
        if ldf.data_type_lookup[fltr.attribute] == "nominal":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value.",
            }
            unique_values = ldf.unique_values[fltr.attribute]
            filter_values.append(fltr.value)
            # creates vis with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_spec.append(lux.Clause(attribute=fltr.attribute, value=val))
                    output.append(Vis(new_spec))
        elif ldf.data_type_lookup[fltr.attribute] == "quantitative":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative inequality operation.",
            }
            # TODO: need to support case where fltr_op is "=" --> auto-binned ranges
            # (any operator missing from this table maps to None, as before)
            complementary_ops = {">": "<=", "<": ">=", ">=": "<", "<=": ">"}
            # Create vis with complementary filter operations
            new_spec = column_spec.copy()
            new_spec.append(
                lux.Clause(
                    attribute=fltr.attribute,
                    filter_op=complementary_ops.get(fltr.filter_op),
                    value=fltr.value,
                )
            )
            output.append(Vis(new_spec, score=1))
    # if no existing filters, create filters using unique values from all categorical variables in the dataset
    else:
        intended_attrs = ", ".join(
            [
                str(clause.attribute)
                for clause in ldf._intent
                if clause.value == "" and clause.attribute != "Record"
            ]
        )
        recommendation = {
            "action": "Filter",
            "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent.",
        }
        # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
        categorical_vars = [
            col
            for col in list(ldf.columns)
            if ldf.cardinality[col] < 30 and col not in column_spec_attr
        ]
        for cat in categorical_vars:
            for val in ldf.unique_values[cat]:
                new_spec = column_spec.copy()
                new_spec.append(lux.Clause(attribute=cat, filter_op="=", value=val))
                output.append(Vis(new_spec))
    vlist = lux.vis.VisList.VisList(output, ldf)
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    vlist = vlist.topK(15)
    recommendation["collection"] = vlist
    return recommendation
def interestingness(vis: Vis, ldf: LuxDataFrame) -> float:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    float
        Interestingness Score; -1 when the vis cannot be scored.

    Raises
    ------
    Exception
        If vis.data is None or empty.
    """
    if vis.data is None or len(vis.data) == 0:
        raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")

    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

    # Count dimensions and measures, ignoring the synthetic "Record" attribute.
    n_dim = 0
    n_msr = 0
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)

    # Line/Bar Chart
    if n_dim == 1 and n_msr in (0, 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        if n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        if n_filter == 0:
            return skewness(vis.data["Number of Records"])
        if n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 2:
            return -1
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 5:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        return 1 / C if C < 40 else -1
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line and barchart cases
    elif vis.mark in ("line", "bar") and n_dim == 2:
        return 0.2
    # Default; also reached by bar charts/histograms with more than one filter,
    # which previously fell off the end of the function and returned None.
    return -1
def interestingness(vis: Vis, ldf: LuxDataFrame) -> float:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    float
        Interestingness Score; -1 when the vis has no data or cannot be scored.
    """
    if vis.data is None or len(vis.data) == 0:
        return -1

    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

    # Count dimensions and measures, ignoring the synthetic "Record" attribute.
    n_dim = 0
    n_msr = 0
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)

    # Line/Bar Chart
    if n_dim == 1 and n_msr in (0, 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        if n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        if "Number of Records" in vis.data:
            if n_filter == 0:
                return skewness(vis.data["Number of Records"])
            if n_filter == 1:
                return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
        return -1
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 10:
            return -1
        if vis.mark == "heatmap":
            return weighted_correlation(vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"])
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 10:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        return 1 / C if C < 40 else -1
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line chart
    elif vis.mark == "line" and n_dim == 2:
        return 0.15
    # for colored bar chart, scoring based on Chi-square test for independence score.
    # gives higher scores to colored bar charts with fewer total categories as these charts are easier to
    # read and thus more useful for users
    elif vis.mark == "bar" and n_dim == 2:
        from scipy.stats import chi2_contingency

        measure_column = measure_lst[0].attribute
        groupby_column = dimension_lst[0].attribute
        color_column = dimension_lst[1].attribute
        groupby_cardinality = ldf.cardinality[groupby_column]
        groupby_unique_vals = ldf.unique_values[groupby_column]
        # One row of measure values per group-by category.
        contingency_table = [
            vis.data[vis.data[groupby_column] == groupby_unique_vals[c]][measure_column]
            for c in range(groupby_cardinality)
        ]
        score = 0.12
        # ValueError results if an entire column of the contingency table is 0, can happen if an applied
        # filter results in a category having no counts; the default score above is kept in that case.
        try:
            color_cardinality = ldf.cardinality[color_column]
            # scale down score based on number of categories
            chi2_score = chi2_contingency(contingency_table)[0] * 0.9 ** (
                color_cardinality + groupby_cardinality
            )
            score = min(0.10, chi2_score)
        except ValueError:
            pass
        return score
    # Default; also reached by bar charts with more than one filter, which
    # previously fell off the end of the function and returned None.
    return -1
def interestingness(vis: Vis, ldf: LuxDataFrame) -> float:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    float
        Interestingness Score; -1 when the vis has no data, cannot be scored, or
        scoring failed while lux.config.interestingness_fallback is enabled.

    Raises
    ------
    Exception
        Any error raised during scoring is re-raised when
        lux.config.interestingness_fallback is disabled.
    """
    if vis.data is None or len(vis.data) == 0:
        return -1
    try:
        filter_specs = utils.get_filter_specs(vis._inferred_intent)
        vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
        n_dim = vis._ndim
        n_msr = vis._nmsr
        n_filter = len(filter_specs)
        attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
        dimension_lst = vis.get_attr_by_data_model("dimension")
        measure_lst = vis.get_attr_by_data_model("measure")
        v_size = len(vis.data)

        # Similarity scoring: when the current vis is a single filtered line chart, candidates are
        # scored by how close they are to it rather than by the per-chart-type metrics below.
        if (
            n_dim == 1
            and (n_msr == 0 or n_msr == 1)
            and ldf.current_vis is not None
            and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
            and len(ldf.current_vis) == 1
            and ldf.current_vis[0].mark == "line"
            and len(get_filter_specs(ldf.intent)) > 0
        ):
            query_vc = VisList(ldf.current_vis, ldf)
            query_vis = query_vc[0]
            preprocess(query_vis)
            preprocess(vis)
            return 1 - euclidean_dist(query_vis, vis)

        # Line/Bar Chart
        if n_dim == 1 and n_msr in (0, 1):
            if v_size < 2:
                return -1
            if n_filter == 0:
                return unevenness(vis, ldf, measure_lst, dimension_lst)
            if n_filter == 1:
                return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
        # Histogram
        elif n_dim == 0 and n_msr == 1:
            if v_size < 2:
                return -1
            if "Number of Records" in vis.data:
                if n_filter == 0:
                    return skewness(vis.data["Number of Records"])
                if n_filter == 1:
                    return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
            return -1
        # Scatter Plot
        elif n_dim == 0 and n_msr == 2:
            if v_size < 10:
                return -1
            if vis.mark == "heatmap":
                return weighted_correlation(
                    vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"]
                )
            if n_filter == 1:
                v_filter_size = get_filtered_size(filter_specs, vis.data)
                sig = v_filter_size / v_size
            else:
                sig = 1
            return sig * monotonicity(vis, attr_specs)
        # Scatterplot colored by Dimension
        elif n_dim == 1 and n_msr == 2:
            if v_size < 10:
                return -1
            color_attr = vis.get_attr_by_channel("color")[0].attribute
            C = ldf.cardinality[color_attr]
            return 1 / C if C < 40 else -1
        # Scatterplot colored by measure
        elif n_msr == 3:
            return 0.1
        # colored line chart
        elif vis.mark == "line" and n_dim == 2:
            return 0.15
        # for colored bar chart, scoring based on Chi-square test for independence score.
        # gives higher scores to colored bar charts with fewer total categories as these charts are
        # easier to read and thus more useful for users
        elif vis.mark == "bar" and n_dim == 2:
            from scipy.stats import chi2_contingency

            measure_column = vis.get_attr_by_data_model("measure")[0].attribute
            dimension_columns = vis.get_attr_by_data_model("dimension")
            groupby_column = dimension_columns[0].attribute
            color_column = dimension_columns[1].attribute
            contingency_tbl = pd.crosstab(
                vis.data[groupby_column],
                vis.data[color_column],
                values=vis.data[measure_column],
                aggfunc=sum,
            )
            try:
                color_cardinality = ldf.cardinality[color_column]
                groupby_cardinality = ldf.cardinality[groupby_column]
                # scale down score based on number of categories
                chi2_score = chi2_contingency(contingency_tbl)[0] * 0.9 ** (
                    color_cardinality + groupby_cardinality
                )
                score = min(0.10, chi2_score)
            except (ValueError, KeyError):
                # ValueError results if an entire column of the contingency table is 0, can happen if
                # an applied filter results in a category having no counts
                score = -1
            return score
        # Default; also reached by bar charts with more than one filter, which
        # previously fell off the end of the function and returned None.
        return -1
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt/SystemExit are never
        # downgraded to a warning.
        if lux.config.interestingness_fallback:
            # Suppress interestingness related issues
            warnings.warn(f"An error occurred when computing interestingness for: {vis}")
            return -1
        else:
            raise
def add_filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each
    categorical value filters the data.

    When the current vis is a single filtered line chart, the action switches to "Similarity" and the
    first entry of the ranked list is dropped from the returned collection.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Filter action.
    """
    filters = utils.get_filter_specs(ldf._intent)
    filter_values = []
    output = []
    # Fallback so `recommendation` is always bound, e.g. when the single existing filter is on an
    # attribute that is neither nominal nor quantitative (no alternative filters are generated then).
    recommendation = {
        "action": "Filter",
        "description": "Shows possible visualizations when filtered by categorical variables in the dataset.",
        "long_description": "Shows possible visualizations when filtered by categorical variables in the dataset.",
    }
    # if fltr is specified, create visualizations where data is filtered by all values of the fltr's
    # categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_vis[0].intent)
    column_spec_attr = [clause.attribute for clause in column_spec]
    if len(filters) == 1:
        # get unique values for all categorical values specified and creates corresponding filters
        fltr = filters[0]
        if ldf.data_type[fltr.attribute] == "nominal":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                f"alternative value.",
                "long_description": f"Swap out the filter value for {fltr.attribute} to other possible values, while "
                f"keeping all else the same. Visualizations are ranked based on interestingness",
            }
            unique_values = ldf.unique_values[fltr.attribute]
            filter_values.append(fltr.value)
            # creates vis with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_spec.append(lux.Clause(attribute=fltr.attribute, value=val))
                    output.append(Vis(new_spec))
        elif ldf.data_type[fltr.attribute] == "quantitative":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                f"alternative inequality operation.",
                "long_description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                f"alternative inequality operation.",
            }
            # Create vis with complementary filter operations
            # NOTE: This section of code has been modified to allow for the rendering of multiple vis
            for op in get_complementary_ops(fltr.filter_op):
                new_spec = column_spec.copy()
                new_spec.append(
                    lux.Clause(
                        attribute=fltr.attribute,
                        filter_op=op,
                        value=fltr.value,
                    )
                )
                output.append(Vis(new_spec, score=1))
    # if no existing filters, create filters using unique values from all categorical variables in the dataset
    else:
        intended_attrs = ", ".join(
            [
                str(clause.attribute)
                for clause in ldf._intent
                if clause.value == "" and clause.attribute != "Record"
            ]
        )
        recommendation = {
            "action": "Filter",
            "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent.",
            "long_description": f"Adding any filter while keeping the attributes on the x and y axes fixed. "
            f"Visualizations are ranked based on interestingness",
        }
        # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
        categorical_vars = [
            col
            for col in list(ldf.columns)
            if 1 < ldf.cardinality[col] < 30 and col not in column_spec_attr
        ]
        for cat in categorical_vars:
            for val in ldf.unique_values[cat]:
                new_spec = column_spec.copy()
                new_spec.append(lux.Clause(attribute=cat, filter_op="=", value=val))
                output.append(Vis(new_spec))
    if (
        ldf.current_vis is not None
        and len(ldf.current_vis) == 1
        and ldf.current_vis[0].mark == "line"
        and len(get_filter_specs(ldf.intent)) > 0
    ):
        recommendation = {
            "action": "Similarity",
            "description": "Show other charts that are visually similar to the Current vis.",
            "long_description": "Show other charts that are visually similar to the Current vis.",
        }
        last = get_filter_specs(ldf.intent)[-1]
        # Replace whatever was generated above: keep the intent minus its final clause ...
        output = ldf.intent.copy()[0:-1]
        # ... and append a clause built from the array of possible values for the attribute
        arr = ldf[last.attribute].unique().tolist()
        output.append(lux.Clause(last.attribute, last.attribute, arr))
    vlist = lux.vis.VisList.VisList(output, ldf)
    # Scores are computed on a second, independently built VisList and copied onto `vlist` by position.
    vlist_copy = lux.vis.VisList.VisList(output, ldf)
    for i, scored_vis in enumerate(vlist_copy):
        vlist[i].score = interestingness(scored_vis, ldf)
    vlist.sort()
    vlist = vlist.showK()
    if recommendation["action"] == "Similarity":
        # Skip the first ranked entry for the Similarity action.
        recommendation["collection"] = vlist[1:]
    else:
        recommendation["collection"] = vlist
    return recommendation
def interestingness(view: View, ldf: LuxDataFrame) -> float:
    """
    Compute the interestingness score of the view.
    The interestingness metric is dependent on the view type.

    Parameters
    ----------
    view : View
    ldf : LuxDataFrame

    Returns
    -------
    float
        Interestingness Score; -1 when the view cannot be scored.

    Raises
    ------
    Exception
        If view.data has not been populated.
    """
    if view.data is None:
        raise Exception(
            "View.data needs to be populated before interestingness can be computed. Run Executor.execute(view,ldf)."
        )

    filter_specs = utils.get_filter_specs(view.spec_lst)
    view_attrs_specs = utils.get_attrs_specs(view.spec_lst)

    # Count dimensions and measures, ignoring the synthetic "Record" attribute.
    n_dim = 0
    n_msr = 0
    for spec in view_attrs_specs:
        if spec.attribute != "Record":
            if spec.data_model == "dimension":
                n_dim += 1
            if spec.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [spec for spec in view_attrs_specs if spec.attribute != "Record"]
    dimension_lst = view.get_attr_by_data_model("dimension")
    measure_lst = view.get_attr_by_data_model("measure")

    # Bar Chart
    if n_dim == 1 and n_msr in (0, 1):
        if n_filter == 0:
            return unevenness(view, ldf, measure_lst, dimension_lst)
        if n_filter == 1:
            return deviation_from_overall(view, ldf, filter_specs, measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if n_filter == 0:
            return skewness(view.data["Count of Records"])
        if n_filter == 1:
            return deviation_from_overall(view, ldf, filter_specs, "Count of Records")
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, view.data)
            v_size = len(view.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(view, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        color_attr = view.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        return 1 / C if C < 40 else -1
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # Default; also reached by bar charts/histograms with more than one filter,
    # which previously fell off the end of the function and returned None.
    return -1
def filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each
    categorical value filters the data.

    When ldf.toggle_benchmarking is set, the elapsed time of the action is printed.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Filter action.
    """
    # for benchmarking; read the flag once so `tic` is always bound whenever `toc` is taken
    benchmarking = bool(ldf.toggle_benchmarking)
    if benchmarking:
        tic = time.perf_counter()

    filters = utils.get_filter_specs(ldf.intent)
    filter_values = []
    output = []
    # if fltr is specified, create visualizations where data is filtered by all values of the fltr's
    # categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_vis[0]._inferred_intent)
    # Materialized as a list: a bare map() iterator would be exhausted by the first `in` test below.
    column_spec_attr = [clause.attribute for clause in column_spec]
    if len(filters) == 1:
        # get unique values for all categorical values specified and creates corresponding filters
        fltr = filters[0]
        unique_values = ldf.unique_values[fltr.attribute]
        filter_values.append(fltr.value)
        # creates views with new filters
        for val in unique_values:
            if val not in filter_values:
                new_spec = column_spec.copy()
                new_spec.append(lux.Clause(attribute=fltr.attribute, value=val))
                output.append(Vis(new_spec))
        recommendation = {
            "action": "Filter",
            "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value.",
        }
    else:
        # if no existing filters, create filters using unique values from all categorical variables in
        # the dataset
        intended_attrs = (
            "<b>"
            + ", ".join(
                [
                    str(clause.attribute)
                    for clause in ldf.intent
                    if clause.value == "" and clause.attribute != "Record"
                ]
            )
            + "</b>"
        )
        recommendation = {
            "action": "Filter",
            "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent.",
        }
        # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
        categorical_vars = [
            col
            for col in list(ldf.columns)
            if ldf.cardinality[col] < 30 and col not in column_spec_attr
        ]
        for cat in categorical_vars:
            for val in ldf.unique_values[cat]:
                new_spec = column_spec.copy()
                new_spec.append(lux.Clause(attribute=cat, filter_op="=", value=val))
                output.append(Vis(new_spec))
    vc = lux.vis.VisList.VisList(output, ldf)
    for view in vc:
        view.score = interestingness(view, ldf)
    vc = vc.topK(15)
    recommendation["collection"] = vc

    # for benchmarking
    if benchmarking:
        toc = time.perf_counter()
        print(f"Performed filter action in {toc - tic:0.4f} seconds")
    return recommendation
def filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each
    categorical value filters the data.

    When ldf.toggle_benchmarking is set, the elapsed time of the action is printed.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Filter action.
    """
    # for benchmarking; read the flag once so `tic` is always bound whenever `toc` is taken
    benchmarking = bool(ldf.toggle_benchmarking)
    if benchmarking:
        tic = time.perf_counter()

    recommendation = {
        "action": "Filter",
        "description": "Shows possible visualizations when filtered by categorical variables in the dataset.",
    }
    filters = utils.get_filter_specs(ldf.context)
    filter_values = []
    output = []
    # if Row is specified, create visualizations where data is filtered by all values of the Row's
    # categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_view[0].spec_lst)
    # Materialized as a list: a bare map() iterator would be exhausted by the first `in` test below.
    column_spec_attr = [spec.attribute for spec in column_spec]
    if len(filters) > 0:
        # get unique values for all categorical values specified and creates corresponding filters
        for row in filters:
            unique_values = ldf.unique_values[row.attribute]
            # filter_values accumulates across all existing filters (unchanged behavior)
            filter_values.append(row.value)
            # creates views with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_spec.append(lux.Spec(attribute=row.attribute, value=val))
                    output.append(View(new_spec))
    else:
        # if no existing filters, create filters using unique values from all categorical variables in
        # the dataset
        # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
        categorical_vars = [
            col
            for col in list(ldf.columns)
            if ldf.cardinality[col] < 40 and col not in column_spec_attr
        ]
        for cat in categorical_vars:
            for val in ldf.unique_values[cat]:
                new_spec = column_spec.copy()
                new_spec.append(lux.Spec(attribute=cat, filter_op="=", value=val))
                output.append(View(new_spec))
    vc = lux.view.ViewCollection.ViewCollection(output)
    vc = vc.load(ldf)
    for view in vc:
        view.score = interestingness(view, ldf)
    vc = vc.topK(15)
    recommendation["collection"] = vc

    # for benchmarking
    if benchmarking:
        toc = time.perf_counter()
        print(f"Performed filter action in {toc - tic:0.4f} seconds")
    return recommendation