def test_sort_bar():
    from lux.compiler.Compiler import Compiler
    from lux.view.View import View
    df = pd.read_csv("lux/data/car.csv")
    view = View([
        lux.Spec(attribute="Acceleration", data_model="measure", data_type="quantitative"),
        lux.Spec(attribute="Origin", data_model="dimension", data_type="nominal")
    ])
    Compiler.determine_encoding(df, view)
    assert view.mark == "bar"
    assert view.spec_lst[1].sort == ''

    df = pd.read_csv("lux/data/car.csv")
    view = View([
        lux.Spec(attribute="Acceleration", data_model="measure", data_type="quantitative"),
        lux.Spec(attribute="Name", data_model="dimension", data_type="nominal")
    ])
    Compiler.determine_encoding(df, view)
    assert view.mark == "bar"
    assert view.spec_lst[1].sort == 'ascending'
def test_vary_filter_val():
    df = pd.read_csv("lux/data/olympic.csv")
    view = View(["Height", "SportType=Ball"])
    view = view.load(df)
    df.set_context_as_view(view)
    df.show_more()
    assert len(df.recommendation["Filter"]) == len(df["SportType"].unique()) - 1
def enforceSpecifiedChannel(view: View, autoChannel: Dict[str, str]):
    """
    Enforces that the channels specified in the View by users override the showMe autoChannels.

    Parameters
    ----------
    view : lux.view.View
        Input View without channel specification.
    autoChannel : Dict[str,str]
        Key-value pair in the form [channel: attributeName] specifying the showMe recommended channel location.

    Returns
    -------
    view : lux.view.View
        View with channel specification combining both original and autoChannel specification.

    Raises
    ------
    ValueError
        Ensures no more than one attribute is placed in the same channel.
    """
    resultDict = {}  # result of enforcing specified channel will be stored in resultDict
    specifiedDict = {}  # specifiedDict={"x":[],"y":[list of Dobj with y specified as channel]}
    # create a dictionary of specified channels in the given dobj
    for val in autoChannel.keys():
        specifiedDict[val] = view.getAttrByChannel(val)
        resultDict[val] = ""
    # for every element, replace with what's in specifiedDict if specified
    for sVal, sAttr in specifiedDict.items():
        if (len(sAttr) == 1):  # if specified in dobj
            # remove the specified channel from autoChannel (matching by value, since channel key may not be same)
            for i in list(autoChannel.keys()):
                if ((autoChannel[i].attribute == sAttr[0].attribute)
                        and (autoChannel[i].channel == sVal)):
                    # need to ensure that the channel is the same (edge case when duplicate Cols with same attribute name)
                    autoChannel.pop(i)
                    break
            sAttr[0].channel = sVal
            resultDict[sVal] = sAttr[0]
        elif (len(sAttr) > 1):
            raise ValueError(
                "There should not be more than one attribute specified in the same channel.")
    # For the leftover channels that are still unspecified in resultDict,
    # and the leftovers in the autoChannel specification,
    # step through them together and fill them in automatically.
    leftover_channels = list(filter(lambda x: resultDict[x] == '', resultDict))
    for leftover_channel, leftover_encoding in zip(leftover_channels, autoChannel.values()):
        leftover_encoding.channel = leftover_channel
        resultDict[leftover_channel] = leftover_encoding
    view.specLst = list(resultDict.values())
    return view
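# A minimal sketch of the override behavior above, using a toy stand-in for
# lux.Spec (attribute names are illustrative, not from a Lux dataset).
# enforceSpecifiedChannel itself needs a full View, so the outcome is
# described in comments rather than executed.
class _ToySpec:
    def __init__(self, attribute, channel=""):
        self.attribute = attribute
        self.channel = channel

# User pins Horsepower to "y"; showMe recommends the opposite layout.
_toyAutoChannel = {"x": _ToySpec("Horsepower"), "y": _ToySpec("Origin")}
# enforceSpecifiedChannel would keep Horsepower on "y" (the user's choice),
# drop the conflicting auto-assignment, and move Origin to the leftover "x".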
def execute_binning(view: View, ldf: LuxDataFrame):
    import math
    import numpy as np
    import pandas as pd
    bin_attribute = list(filter(lambda x: x.bin_size != 0, view.spec_lst))[0]
    num_bins = bin_attribute.bin_size
    attr_min = min(ldf.unique_values[bin_attribute.attribute])
    attr_max = max(ldf.unique_values[bin_attribute.attribute])
    attr_type = type(ldf.unique_values[bin_attribute.attribute][0])
    # need to calculate the bin edges before querying for the relevant data
    bin_width = (attr_max - attr_min) / num_bins
    upper_edges = []
    for e in range(1, num_bins):
        curr_edge = attr_min + e * bin_width
        if attr_type == int:
            upper_edges.append(str(math.ceil(curr_edge)))
        else:
            upper_edges.append(str(curr_edge))
    upper_edges = ",".join(upper_edges)
    view_filter, filter_vars = SQLExecutor.execute_filter(view)
    bin_count_query = "SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket({}, '{}') FROM {}) as Buckets GROUP BY width_bucket ORDER BY width_bucket".format(
        bin_attribute.attribute, '{' + upper_edges + '}', ldf.table_name)
    bin_count_data = pd.read_sql(bin_count_query, ldf.SQLconnection)
    # counts, bin_edges = np.histogram(ldf[bin_attribute.attribute], bins=bin_attribute.bin_size)
    # bin_edges is of size N+1, so need to compute bin_centers as the bin locations
    upper_edges = [float(i) for i in upper_edges.split(",")]
    if attr_type == int:
        bin_centers = np.array([math.ceil((attr_min + attr_min + bin_width) / 2)])
    else:
        bin_centers = np.array([(attr_min + attr_min + bin_width) / 2])
    bin_centers = np.append(
        bin_centers,
        np.mean(np.vstack([upper_edges[0:-1], upper_edges[1:]]), axis=0))
    if attr_type == int:
        bin_centers = np.append(
            bin_centers,
            math.ceil((upper_edges[len(upper_edges) - 1] + attr_max) / 2))
    else:
        bin_centers = np.append(
            bin_centers, (upper_edges[len(upper_edges) - 1] + attr_max) / 2)
    if len(bin_centers) > len(bin_count_data):
        bucket_labels = bin_count_data['width_bucket'].unique()
        for i in range(0, len(bin_centers)):
            if i not in bucket_labels:
                bin_count_data = bin_count_data.append(
                    pd.DataFrame([[i, 0]], columns=bin_count_data.columns))
    view.data = pd.DataFrame(
        np.array([bin_centers, list(bin_count_data['count'])]).T,
        columns=[bin_attribute.attribute, "Count of Records (binned)"])
    view.data = utils.pandas_to_lux(view.data)
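# A sketch of the bucketing semantics assumed by the query above (PostgreSQL's
# array form of width_bucket, where the array holds ascending bucket bounds):
# the bucket number equals the count of edges <= value, i.e. bisect_right.
import bisect

def width_bucket_sketch(value, upper_edges):
    # upper_edges are the ascending edges joined into the '{...}' literal above
    return bisect.bisect_right(upper_edges, value)

assert width_bucket_sketch(4.9, [5.0, 10.0, 15.0]) == 0   # below the first edge
assert width_bucket_sketch(5.0, [5.0, 10.0, 15.0]) == 1
assert width_bucket_sketch(20.0, [5.0, 10.0, 15.0]) == 3  # beyond the last edge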
def execute_aggregate(view: View, ldf: LuxDataFrame):
    import pandas as pd
    x_attr = view.get_attr_by_channel("x")[0]
    y_attr = view.get_attr_by_channel("y")[0]
    groupby_attr = ""
    measure_attr = ""
    if (y_attr.aggregation != ""):
        groupby_attr = x_attr
        measure_attr = y_attr
        agg_func = y_attr.aggregation
    if (x_attr.aggregation != ""):
        groupby_attr = y_attr
        measure_attr = x_attr
        agg_func = x_attr.aggregation
    if (measure_attr != ""):
        # barchart case, need count data for each group
        if (measure_attr.attribute == "Record"):
            where_clause, filter_vars = SQLExecutor.execute_filter(view)
            count_query = "SELECT {}, COUNT({}) FROM {} {} GROUP BY {}".format(
                groupby_attr.attribute, groupby_attr.attribute, ldf.table_name,
                where_clause, groupby_attr.attribute)
            view.data = pd.read_sql(count_query, ldf.SQLconnection)
            view.data = view.data.rename(columns={"count": "Record"})
            view.data = utils.pandas_to_lux(view.data)
        else:
            where_clause, filter_vars = SQLExecutor.execute_filter(view)
            # map Lux aggregation names onto their SQL equivalents
            sql_agg_funcs = {"mean": "AVG", "sum": "SUM", "max": "MAX"}
            if agg_func in sql_agg_funcs:
                agg_query = "SELECT {}, {}({}) as {} FROM {} {} GROUP BY {}".format(
                    groupby_attr.attribute, sql_agg_funcs[agg_func],
                    measure_attr.attribute, measure_attr.attribute,
                    ldf.table_name, where_clause, groupby_attr.attribute)
                view.data = pd.read_sql(agg_query, ldf.SQLconnection)
                view.data = utils.pandas_to_lux(view.data)
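# A quick illustration of the query template above, with hypothetical table
# and column names (not from the Lux API); where_clause comes back empty when
# the view has no filters.
where_clause_example = ""
print("SELECT {}, AVG({}) as {} FROM {} {} GROUP BY {}".format(
    "Origin", "Horsepower", "Horsepower", "cars", where_clause_example, "Origin"))
# SELECT Origin, AVG(Horsepower) as Horsepower FROM cars  GROUP BY Origin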
def test_refresh_inplace():
    df = pd.DataFrame({
        'date': ['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01'],
        'value': [10.5, 15.2, 20.3, 25.2]
    })
    assert df.data_type['nominal'][0] == 'date'
    from lux.view.View import View
    view = View(["date", "value"])
    view.load(df)
    df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
    assert df.data_type['temporal'][0] == 'date'
def execute_binning(view: View):
    '''
    Binning of data points for generating histograms

    Parameters
    ----------
    view: lux.View
        lux.View object that represents a visualization

    Returns
    -------
    None
    '''
    import numpy as np
    import pandas as pd  # is this import going to be conflicting with LuxDf?
    bin_attribute = list(filter(lambda x: x.bin_size != 0, view.spec_lst))[0]
    # TODO: binning runs for the Name attribute. Name attribute has datatype quantitative, which is wrong.
    counts, bin_edges = np.histogram(view.data[bin_attribute.attribute],
                                     bins=bin_attribute.bin_size)
    # bin_edges is of size N+1, so need to compute bin_center as the bin location
    bin_center = np.mean(np.vstack([bin_edges[0:-1], bin_edges[1:]]), axis=0)
    # TODO: Should view.data be a LuxDataFrame or a Pandas DataFrame?
    view.data = pd.DataFrame(np.array([bin_center, counts]).T,
                             columns=[bin_attribute.attribute, "Count of Records"])
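# Standalone illustration of the bin-center computation above: np.histogram
# returns N+1 edges for N bins, so the centers are the pairwise means of
# consecutive edges.
import numpy as np

_values = np.array([1.0, 2.0, 2.5, 3.0, 9.0])
_counts, _edges = np.histogram(_values, bins=4)  # len(_edges) == 5
_centers = np.mean(np.vstack([_edges[0:-1], _edges[1:]]), axis=0)
assert len(_centers) == len(_counts) == 4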
def executeAggregate(view: View):
    '''
    Aggregate data points on an axis for bar or line charts

    Parameters
    ----------
    view: lux.View
        lux.View object that represents a visualization

    Returns
    -------
    None
    '''
    xAttr = view.getAttrByChannel("x")[0]
    yAttr = view.getAttrByChannel("y")[0]
    groupbyAttr = ""
    measureAttr = ""
    if (yAttr.aggregation != ""):
        groupbyAttr = xAttr
        measureAttr = yAttr
        aggFunc = yAttr.aggregation
    if (xAttr.aggregation != ""):
        groupbyAttr = yAttr
        measureAttr = xAttr
        aggFunc = xAttr.aggregation
    allAttrVals = view.data.uniqueValues[groupbyAttr.attribute]
    if (measureAttr != ""):
        if (measureAttr.attribute == "Record"):
            view.data = view.data.reset_index()
            view.data = view.data.groupby(groupbyAttr.attribute).count().reset_index()
            view.data = view.data.rename(columns={"index": "Record"})
            view.data = view.data[[groupbyAttr.attribute, "Record"]]
        else:
            groupbyResult = view.data.groupby(groupbyAttr.attribute)
            view.data = groupbyResult.agg(aggFunc).reset_index()
        resultVals = list(view.data[groupbyAttr.attribute])
        if (len(resultVals) != len(allAttrVals)):
            # For filtered aggregations with missing groupby-attribute values,
            # set the aggregated value to 0, since there are no datapoints
            for vals in allAttrVals:
                if (vals not in resultVals):
                    view.data.loc[len(view.data)] = [vals, 0]
        assert len(list(view.data[groupbyAttr.attribute])) == len(allAttrVals), \
            f"Aggregated data missing values compared to original range of values of `{groupbyAttr.attribute}`."
        view.data = view.data.sort_values(by=groupbyAttr.attribute, ascending=True)
        view.data = view.data.reset_index()
        view.data = view.data.drop(columns="index")
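# A minimal, self-contained sketch of the zero-filling step above (toy column
# names, plain pandas rather than the Lux API): groups absent after filtering
# are re-added with an aggregate value of 0.
import pandas as pd

_df = pd.DataFrame({"Origin": ["USA", "USA", "Japan"], "Horsepower": [130, 165, 97]})
_allAttrVals = ["USA", "Japan", "Europe"]
_agg = _df.groupby("Origin").agg("mean").reset_index()
for _val in _allAttrVals:
    if _val not in list(_agg["Origin"]):
        _agg.loc[len(_agg)] = [_val, 0]  # no datapoints for this group
assert len(_agg) == len(_allAttrVals)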
def execute_filter(view: View):
    assert view.data is not None, "execute_filter assumes input view.data is populated (if not, populate with LuxDataFrame values)"
    filters = utils.get_filter_specs(view.spec_lst)
    if (filters):
        # TODO: Need to handle OR logic
        for filter in filters:
            view.data = PandasExecutor.apply_filter(view.data, filter.attribute,
                                                    filter.filter_op, filter.value)
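# PandasExecutor.apply_filter is defined elsewhere; a plausible sketch of its
# boolean-masking behavior for common operators (hedged -- the real
# implementation may handle more cases):
import pandas as pd

def apply_filter_sketch(df: pd.DataFrame, attribute: str, op: str, val):
    if op == "=":
        return df[df[attribute] == val]
    elif op == ">":
        return df[df[attribute] > val]
    elif op == "<":
        return df[df[attribute] < val]
    return df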
def combine(colAttrs, accum):
    # Recursively generates the cartesian product of attribute combinations;
    # `filters` and `collection` are captured from the enclosing scope.
    last = (len(colAttrs) == 1)
    n = len(colAttrs[0])
    for i in range(n):
        columnList = copy.deepcopy(accum + [colAttrs[0][i]])
        if last:
            if len(filters) > 0:  # if we have filters, generate combinations for each row.
                for row in filters:
                    specLst = copy.deepcopy(columnList + [row])
                    view = View(specLst,
                                title=f"{row.attribute} {row.filterOp} {row.value}")
                    collection.append(view)
            else:
                view = View(columnList)
                collection.append(view)
        else:
            combine(colAttrs[1:], columnList)
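# Self-contained illustration of the recursion above on plain lists: combine
# consumes one attribute group per level and emits every cross-product path.
def combine_sketch(col_attrs, accum, out):
    last = (len(col_attrs) == 1)
    for item in col_attrs[0]:
        path = accum + [item]
        if last:
            out.append(path)
        else:
            combine_sketch(col_attrs[1:], path, out)

_out = []
combine_sketch([["A", "B"], ["x", "y"]], [], _out)
assert _out == [["A", "x"], ["A", "y"], ["B", "x"], ["B", "y"]]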
def interestingness(view: View, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the view.
    The interestingness metric is dependent on the view type.

    Parameters
    ----------
    view : View
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score
    """
    if view.data is None:
        raise Exception(
            "View.data needs to be populated before interestingness can be computed. Run Executor.execute(view,ldf).")
    n_dim = 0
    n_msr = 0
    filter_specs = utils.get_filter_specs(view.spec_lst)
    view_attrs_specs = utils.get_attrs_specs(view.spec_lst)
    for spec in view_attrs_specs:
        if (spec.attribute != "Record"):
            if (spec.data_model == 'dimension'):
                n_dim += 1
            if (spec.data_model == 'measure'):
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [spec for spec in view_attrs_specs if spec.attribute != "Record"]
    dimension_lst = view.get_attr_by_data_model("dimension")
    measure_lst = view.get_attr_by_data_model("measure")

    # Bar Chart
    if (n_dim == 1 and (n_msr == 0 or n_msr == 1)):
        if (n_filter == 0):
            return unevenness(view, ldf, measure_lst, dimension_lst)
        elif (n_filter == 1):
            return deviation_from_overall(view, ldf, filter_specs,
                                          measure_lst[0].attribute)
    # Histogram
    elif (n_dim == 0 and n_msr == 1):
        if (n_filter == 0):
            v = view.data["Count of Records"]
            return skewness(v)
        elif (n_filter == 1):
            return deviation_from_overall(view, ldf, filter_specs,
                                          "Count of Records")
    # Scatter Plot
    elif (n_dim == 0 and n_msr == 2):
        if (n_filter == 1):
            v_filter_size = get_filtered_size(filter_specs, view.data)
            v_size = len(view.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(view, attr_specs)
    # Scatterplot colored by Dimension
    elif (n_dim == 1 and n_msr == 2):
        color_attr = view.get_attr_by_channel("color")[0].attribute
        C = ldf.cardinality[color_attr]
        if (C < 40):
            return 1 / C
        else:
            return -1
    # Scatterplot colored by dimension
    # NOTE: unreachable -- duplicates the condition of the branch above
    elif (n_dim == 1 and n_msr == 2):
        return 0.2
    # Scatterplot colored by measure
    elif (n_msr == 3):
        return 0.1
    # Default
    else:
        return -1
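# The scoring helpers referenced above (unevenness, skewness, monotonicity,
# deviation_from_overall) are defined elsewhere; plausible sketches of the two
# simplest ones, under simple assumptions -- the real implementations may differ.
import numpy as np
from scipy.stats import skew

def skewness_sketch(v):
    # histogram interestingness: how asymmetric the count distribution is
    return skew(v)

def monotonicity_sketch(view, attr_specs):
    # scatterplot interestingness: squared correlation of the two measures
    msr1 = attr_specs[0].attribute
    msr2 = attr_specs[1].attribute
    return np.corrcoef(view.data[msr1], view.data[msr2])[0, 1] ** 2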
def test_remove():
    from lux.view.View import View
    df = pd.read_csv("lux/data/car.csv")
    view = View(["Horsepower", "Horsepower"])
    view.load(df)
    view.remove_column_from_spec_new("Horsepower", remove_first=False)
    assert (view.spec_lst == []), "Remove all instances of Horsepower"

    df = pd.read_csv("lux/data/car.csv")
    view = View(["Horsepower", "Horsepower"])
    view.load(df)
    view.remove_column_from_spec_new("Horsepower", remove_first=True)
    assert (len(view.spec_lst) == 1), "Remove only 1 instance of Horsepower"
    assert (view.spec_lst[0].attribute == "Horsepower"), "Remove only 1 instance of Horsepower"
def filter(ldf):
    '''
    Iterates over all possible values of a categorical variable and generates
    visualizations where each categorical value filters the data.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context.

    Returns
    -------
    recommendations : Dict[str,obj]
        Object with a collection of visualizations that result from the Filter action.
    '''
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    recommendation = {
        "action": "Filter",
        "description": "Shows possible visualizations when filtered by categorical variables in the dataset."
    }
    filters = utils.get_filter_specs(ldf.context)
    filter_values = []
    output = []
    # if Row is specified, create visualizations where data is filtered by all values of the Row's categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_view[0].spec_lst)
    column_spec_attr = list(map(lambda x: x.attribute, column_spec))
    if len(filters) > 0:
        # get unique values for all categorical values specified and create corresponding filters
        for row in filters:
            unique_values = ldf.unique_values[row.attribute]
            filter_values.append(row.value)
            # create views with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_filter = lux.Spec(attribute=row.attribute, value=val)
                    new_spec.append(new_filter)
                    temp_view = View(new_spec)
                    output.append(temp_view)
    else:
        # if no existing filters, create filters using unique values from all categorical variables in the dataset
        categorical_vars = []
        for col in list(ldf.columns):
            # if cardinality is not too high, and attribute is not one of the X,Y (specified) columns
            if ldf.cardinality[col] < 40 and col not in column_spec_attr:
                categorical_vars.append(col)
        for cat in categorical_vars:
            unique_values = ldf.unique_values[cat]
            for i in range(0, len(unique_values)):
                new_spec = column_spec.copy()
                new_filter = lux.Spec(attribute=cat, filter_op="=",
                                      value=unique_values[i])
                new_spec.append(new_filter)
                temp_view = View(new_spec)
                output.append(temp_view)
    vc = lux.view.ViewCollection.ViewCollection(output)
    vc = vc.load(ldf)
    for view in vc:
        view.score = interestingness(view, ldf)
    vc = vc.topK(15)
    recommendation["collection"] = vc
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed filter action in {toc - tic:0.4f} seconds")
    return recommendation
def generalize(ldf):
    '''
    Generates all possible visualizations when one attribute or filter from the current view is removed.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context.

    Returns
    -------
    recommendations : Dict[str,obj]
        Object with a collection of visualizations that result from the Generalize action.
    '''
    # for benchmarking
    if ldf.toggleBenchmarking == True:
        tic = time.perf_counter()
    # takes in a dataObject and generates a list of new dataObjects, each with a single measure from the original object removed
    # --> return list of dataObjects with corresponding interestingness scores
    recommendation = {
        "action": "Generalize",
        "description": "Remove one attribute or filter to observe a more general trend."
    }
    output = []
    excludedColumns = []
    columnSpec = list(filter(lambda x: x.value == "" and x.attribute != "Record",
                             ldf.context))
    rowSpecs = utils.getFilterSpecs(ldf.context)
    # if we do not have enough column attributes, or have too many, return no views.
    if (len(columnSpec) < 2 or len(columnSpec) > 4):
        recommendation["collection"] = []
        return recommendation
    for spec in columnSpec:
        columns = spec.attribute
        if type(columns) == list:
            for column in columns:
                if column not in excludedColumns:
                    tempView = View(ldf.context)
                    tempView.removeColumnFromSpecNew(column)
                    excludedColumns.append(column)
                    output.append(tempView)
        elif type(columns) == str:
            if columns not in excludedColumns:
                tempView = View(ldf.context)
                tempView.removeColumnFromSpecNew(columns)
                excludedColumns.append(columns)
                output.append(tempView)
    for i, spec in enumerate(rowSpecs):
        newSpec = ldf.context.copy()
        newSpec.pop(i)
        tempView = View(newSpec)
        output.append(tempView)
    vc = lux.view.ViewCollection.ViewCollection(output)
    vc = vc.load(ldf)
    recommendation["collection"] = vc
    for view in vc:
        view.score = interestingness(view, ldf)
    vc.sort(removeInvalid=True)
    # for benchmarking
    if ldf.toggleBenchmarking == True:
        toc = time.perf_counter()
        print(f"Performed generalize action in {toc - tic:0.4f} seconds")
    return recommendation
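# Toy illustration (plain strings rather than Lux specs) of what Generalize
# enumerates: drop one attribute or filter at a time from the context.
_context = ["Horsepower", "Origin", "Origin=USA"]
_candidates = [_context[:i] + _context[i + 1:] for i in range(len(_context))]
assert _candidates == [["Origin", "Origin=USA"],
                       ["Horsepower", "Origin=USA"],
                       ["Horsepower", "Origin"]]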
def determineEncoding(ldf: LuxDataFrame, view: View):
    '''
    Populates View with the appropriate mark type and channel information based on ShowMe logic
    Currently supports up to 3 dimensions or measures

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context
    view : lux.view.View

    Returns
    -------
    None

    Notes
    -----
    Implements automatic encoding from Tableau's VizQL
    Mackinlay, J. D., Hanrahan, P., & Stolte, C. (2007).
    Show Me: Automatic presentation for visual analysis.
    IEEE Transactions on Visualization and Computer Graphics, 13(6), 1137–1144.
    https://doi.org/10.1109/TVCG.2007.70594
    '''
    # Count number of measures and dimensions
    Ndim = 0
    Nmsr = 0
    filters = []
    for spec in view.specLst:
        if (spec.value == ""):
            if (spec.dataModel == "dimension"):
                Ndim += 1
            elif (spec.dataModel == "measure" and spec.attribute != "Record"):
                Nmsr += 1
        else:  # preserve to add back to specLst later
            filters.append(spec)

    # Helper function (TODO: Move this into utils)
    def lineOrBar(ldf, dimension, measure):
        dimType = dimension.dataType
        # If no aggregation function is specified, then default to average
        if (measure.aggregation == ""):
            measure.aggregation = "mean"
        if (dimType == "temporal" or dimType == "ordinal"):
            return "line", {"x": dimension, "y": measure}
        else:  # unordered categorical
            # if cardinality is larger than 5, then sort the bars
            if ldf.cardinality[dimension.attribute] > 5:
                dimension.sort = "ascending"
            return "bar", {"x": measure, "y": dimension}

    # ShowMe logic + additional heuristics
    # countCol = Spec(attribute="count()", dataModel="measure")
    countCol = Spec(attribute="Record", aggregation="count",
                    dataModel="measure", dataType="quantitative")
    # xAttr = view.getAttrByChannel("x")  # not used as of now
    # yAttr = view.getAttrByChannel("y")
    # zAttr = view.getAttrByChannel("z")
    autoChannel = {}
    if (Ndim == 0 and Nmsr == 1):
        # Histogram with Count
        measure = view.getAttrByDataModel("measure", excludeRecord=True)[0]
        if (len(view.getAttrByAttrName("Record")) == 0):
            view.specLst.append(countCol)
        # If no bin specified, then default to 10
        if (measure.binSize == 0):
            measure.binSize = 10
        autoChannel = {"x": measure, "y": countCol}
        view.xMinMax = ldf.xMinMax
        view.mark = "histogram"
    elif (Ndim == 1 and (Nmsr == 0 or Nmsr == 1)):
        # Line or Bar Chart
        if (Nmsr == 0):
            view.specLst.append(countCol)
        dimension = view.getAttrByDataModel("dimension")[0]
        measure = view.getAttrByDataModel("measure")[0]
        view.mark, autoChannel = lineOrBar(ldf, dimension, measure)
    elif (Ndim == 2 and (Nmsr == 0 or Nmsr == 1)):
        # Line or Bar chart broken down by the dimension
        dimensions = view.getAttrByDataModel("dimension")
        d1 = dimensions[0]
        d2 = dimensions[1]
        if (ldf.cardinality[d1.attribute] < ldf.cardinality[d2.attribute]):
            # d1.channel = "color"
            view.removeColumnFromSpec(d1.attribute)
            dimension = d2
            colorAttr = d1
        else:
            if (d1.attribute == d2.attribute):
                # if same attribute then removeColumnFromSpec would remove both dims; we only want to remove one
                view.specLst.pop(0)
            else:
                view.removeColumnFromSpec(d2.attribute)
            dimension = d1
            colorAttr = d2
        # Colored Bar/Line chart with Count as default measure
        if (Nmsr == 0):
            view.specLst.append(countCol)
        measure = view.getAttrByDataModel("measure")[0]
        view.mark, autoChannel = lineOrBar(ldf, dimension, measure)
        autoChannel["color"] = colorAttr
    elif (Ndim == 0 and Nmsr == 2):
        # Scatterplot
        view.xMinMax = ldf.xMinMax
        view.yMinMax = ldf.yMinMax
        view.mark = "scatter"
        autoChannel = {"x": view.specLst[0], "y": view.specLst[1]}
    elif (Ndim == 1 and Nmsr == 2):
        # Scatterplot broken down by the dimension
        measure = view.getAttrByDataModel("measure")
        m1 = measure[0]
        m2 = measure[1]
        colorAttr = view.getAttrByDataModel("dimension")[0]
        view.removeColumnFromSpec(colorAttr)
        view.xMinMax = ldf.xMinMax
        view.yMinMax = ldf.yMinMax
        view.mark = "scatter"
        autoChannel = {"x": m1, "y": m2, "color": colorAttr}
    elif (Ndim == 0 and Nmsr == 3):
        # Scatterplot with color
        view.xMinMax = ldf.xMinMax
        view.yMinMax = ldf.yMinMax
        view.mark = "scatter"
        autoChannel = {
            "x": view.specLst[0],
            "y": view.specLst[1],
            "color": view.specLst[2]
        }
    if (autoChannel != {}):
        view = Compiler.enforceSpecifiedChannel(view, autoChannel)
        view.specLst.extend(filters)  # add back the preserved filters
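# Usage sketch for determineEncoding, mirroring test_sort_bar above (note that
# this file mixes camelCase and snake_case revisions of the same API):
#
#   view = View([Spec(attribute="Acceleration", dataModel="measure", dataType="quantitative"),
#                Spec(attribute="Origin", dataModel="dimension", dataType="nominal")])
#   determineEncoding(ldf, view)
#   assert view.mark == "bar"  # 1 dimension + 1 measure -> bar chart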