def execute_binning(view: Vis, ldf: LuxDataFrame):
    '''
    Binning of data points for generating histograms, computed on the SQL backend.

    Parameters
    ----------
    view: lux.Vis
        lux.Vis object that represents a visualization
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame backed by a SQL table (reads ldf.table_name and ldf.SQLconnection).

    Returns
    -------
    None
        Populates view.data with a two-column frame: bin center, "Number of Records".
    '''
    import math  # was used below but never imported locally, unlike numpy/pandas
    import numpy as np
    import pandas as pd
    bin_attribute = list(
        filter(lambda x: x.bin_size != 0, view._inferred_intent))[0]
    num_bins = bin_attribute.bin_size
    attr_min = min(ldf.unique_values[bin_attribute.attribute])
    attr_max = max(ldf.unique_values[bin_attribute.attribute])
    attr_type = type(ldf.unique_values[bin_attribute.attribute][0])

    # need to calculate the bin edges before querying for the relevant data
    bin_width = (attr_max - attr_min) / num_bins
    upper_edges = []
    for e in range(1, num_bins):
        curr_edge = attr_min + e * bin_width
        if attr_type == int:
            upper_edges.append(str(math.ceil(curr_edge)))
        else:
            upper_edges.append(str(curr_edge))
    upper_edges = ",".join(upper_edges)
    view_filter, filter_vars = SQLExecutor.execute_filter(view)
    # NOTE(review): attribute/table names are interpolated directly into the SQL
    # string. They come from the DataFrame schema rather than end users, but this
    # is unsafe if that assumption ever changes — verify upstream sanitization.
    bin_count_query = "SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket({}, '{}') FROM {}) as Buckets GROUP BY width_bucket ORDER BY width_bucket".format(
        bin_attribute.attribute, '{' + upper_edges + '}', ldf.table_name)
    bin_count_data = pd.read_sql(bin_count_query, ldf.SQLconnection)

    # bin_edges of size N+1, so need to compute bin centers as the bin locations:
    # first bin, interior bins (midpoints of consecutive edges), then last bin
    upper_edges = [float(i) for i in upper_edges.split(",")]
    if attr_type == int:
        bin_centers = np.array(
            [math.ceil((attr_min + attr_min + bin_width) / 2)])
    else:
        bin_centers = np.array([(attr_min + attr_min + bin_width) / 2])
    bin_centers = np.append(
        bin_centers,
        np.mean(np.vstack([upper_edges[0:-1], upper_edges[1:]]), axis=0))
    if attr_type == int:
        bin_centers = np.append(
            bin_centers,
            math.ceil((upper_edges[len(upper_edges) - 1] + attr_max) / 2))
    else:
        bin_centers = np.append(
            bin_centers, (upper_edges[len(upper_edges) - 1] + attr_max) / 2)

    # pad buckets that received no rows with zero counts so that the count column
    # lines up positionally with bin_centers
    if len(bin_centers) > len(bin_count_data):
        bucket_labels = bin_count_data['width_bucket'].unique()
        # NOTE(review): assumes width_bucket labels span 0..len(bin_centers)-1 —
        # confirm against the SQL dialect's width_bucket() semantics.
        missing = [i for i in range(0, len(bin_centers)) if i not in bucket_labels]
        if missing:
            filler = pd.DataFrame([[i, 0] for i in missing],
                                  columns=bin_count_data.columns)
            # DataFrame.append was deprecated and removed in pandas 2.0
            bin_count_data = pd.concat([bin_count_data, filler],
                                       ignore_index=True)
        # BUGFIX: previously the zero-count rows were left at the end of the
        # frame regardless of bucket index, so counts could be paired with the
        # wrong bin centers; re-sort by bucket to restore positional alignment.
        bin_count_data = bin_count_data.sort_values(
            by='width_bucket').reset_index(drop=True)
    view.data = pd.DataFrame(
        np.array([bin_centers, list(bin_count_data['count'])]).T,
        columns=[bin_attribute.attribute, "Number of Records"])
    view.data = utils.pandas_to_lux(view.data)
def execute_aggregate(view: Vis, ldf: LuxDataFrame):
    '''
    Aggregate data points on an axis for bar or line charts, computed on the SQL backend.

    Parameters
    ----------
    view: lux.Vis
        lux.Vis object that represents a visualization
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame backed by a SQL table (reads ldf.table_name and ldf.SQLconnection).

    Returns
    -------
    None
        Populates view.data with the grouped/aggregated result.
    '''
    import pandas as pd
    x_attr = view.get_attr_by_channel("x")[0]
    y_attr = view.get_attr_by_channel("y")[0]
    groupby_attr = ""
    measure_attr = ""
    # whichever axis carries an aggregation becomes the measure; the other is the groupby
    if (y_attr.aggregation != ""):
        groupby_attr = x_attr
        measure_attr = y_attr
        agg_func = y_attr.aggregation
    if (x_attr.aggregation != ""):
        groupby_attr = y_attr
        measure_attr = x_attr
        agg_func = x_attr.aggregation
    if (measure_attr != ""):
        where_clause, filter_vars = SQLExecutor.execute_filter(view)
        if (measure_attr.attribute == "Record"):
            # bar chart case: need a row count for each group
            count_query = "SELECT {}, COUNT({}) FROM {} {} GROUP BY {}".format(
                groupby_attr.attribute, groupby_attr.attribute, ldf.table_name,
                where_clause, groupby_attr.attribute)
            view.data = pd.read_sql(count_query, ldf.SQLconnection)
            view.data = view.data.rename(columns={"count": "Record"})
            view.data = utils.pandas_to_lux(view.data)
        else:
            # Map lux aggregation names onto SQL aggregate functions. This
            # replaces three near-identical copy-pasted branches (all of which
            # misleadingly reused the name `mean_query`) with one query builder
            # producing byte-identical SQL. Unsupported agg_func values are
            # skipped, matching the original behavior.
            sql_agg_funcs = {"mean": "AVG", "sum": "SUM", "max": "MAX"}
            if agg_func in sql_agg_funcs:
                agg_query = "SELECT {}, {}({}) as {} FROM {} {} GROUP BY {}".format(
                    groupby_attr.attribute, sql_agg_funcs[agg_func],
                    measure_attr.attribute, measure_attr.attribute,
                    ldf.table_name, where_clause, groupby_attr.attribute)
                view.data = pd.read_sql(agg_query, ldf.SQLconnection)
                view.data = utils.pandas_to_lux(view.data)
        # pad empty categories with 0 counts after filter is applied
        all_attr_vals = ldf.unique_values[groupby_attr.attribute]
        result_vals = list(view.data[groupby_attr.attribute])
        if (len(result_vals) != len(all_attr_vals)):
            # For filtered aggregation that have missing groupby-attribute
            # values, set these aggregated value as 0, since no datapoints
            for vals in all_attr_vals:
                if (vals not in result_vals):
                    view.data.loc[len(view.data)] = [vals] + [0] * (
                        len(view.data.columns) - 1)
def execute_binning(view: Vis):
    '''
    Binning of data points for generating histograms

    Parameters
    ----------
    view: lux.Vis
        lux.Vis object that represents a visualization

    Returns
    -------
    None
        Populates view.data with a two-column frame: bin center, "Number of Records".
    '''
    import numpy as np
    import pandas as pd  # is this import going to be conflicting with LuxDf?
    # the single intent clause with a non-zero bin size drives the histogram
    binned_clause = next(
        clause for clause in view._inferred_intent if clause.bin_size != 0)
    # TODO: binning runs for name attribute. Name attribute has datatype
    # quantitative which is wrong.
    counts, edges = np.histogram(view.data[binned_clause.attribute],
                                 bins=binned_clause.bin_size)
    # edges has N+1 entries; each bin is represented by the midpoint of its
    # two bounding edges
    centers = np.mean(np.vstack([edges[:-1], edges[1:]]), axis=0)
    # TODO: Should view.data be a LuxDataFrame or a Pandas DataFrame?
    view.data = pd.DataFrame(
        np.array([centers, counts]).T,
        columns=[binned_clause.attribute, "Number of Records"])
def execute_filter(view: Vis):
    """Apply every filter clause in the view's intent to view.data in place.

    Returns True when at least one filter clause was applied, False otherwise.
    """
    assert view.data is not None, "execute_filter assumes input view.data is populated (if not, populate with LuxDataFrame values)"
    filters = utils.get_filter_specs(view._inferred_intent)
    if not filters:
        return False
    # TODO: Need to handle OR logic
    for filter_clause in filters:
        view.data = PandasExecutor.apply_filter(view.data,
                                                filter_clause.attribute,
                                                filter_clause.filter_op,
                                                filter_clause.value)
    return True
def test_vis_private_properties():
    """Check that Vis exposes data/code/min_max/mark as read-only properties."""
    from lux.vis.Vis import Vis
    df = pd.read_csv("lux/data/car.csv")
    vis = Vis(["Horsepower", "Weight"], df)
    vis._repr_html_()
    # each property yields the expected type/value after rendering
    assert isinstance(vis.data, lux.core.frame.LuxDataFrame)
    assert isinstance(vis.code, dict)
    assert isinstance(vis.min_max, dict)
    assert vis.mark == "scatter"
    # every property rejects assignment
    for prop in ("data", "code", "min_max", "mark"):
        with pytest.raises(AttributeError, match="can't set attribute"):
            setattr(vis, prop, "some val")
def test_vis_private_properties(global_var):
    """Check that Vis exposes data/code/min_max/mark as read-only properties."""
    from lux.vis.Vis import Vis
    df = pytest.car_df
    vis = Vis(["Horsepower", "Weight"], df)
    vis._ipython_display_()
    # each property yields the expected type/value after rendering
    assert isinstance(vis.data, lux.core.frame.LuxDataFrame)
    assert isinstance(vis.code, dict)
    assert isinstance(vis.min_max, dict)
    assert vis.mark == "scatter"
    # every property rejects assignment
    for prop in ("data", "code", "min_max", "mark"):
        with pytest.raises(AttributeError, match="can't set attribute"):
            setattr(vis, prop, "some val")
def execute_aggregate(view: Vis, isFiltered=True):
    '''
    Aggregate data points on an axis for bar or line charts

    Parameters
    ----------
    view: lux.Vis
        lux.Vis object that represents a visualization
    isFiltered: bool
        Whether a filter has been applied upstream; when True (or when a color
        channel is present), missing groupby categories are padded with zeros.

    Returns
    -------
    None
        Populates view.data with the grouped/aggregated result, sorted by the
        groupby attribute.
    '''
    import pandas as pd
    x_attr = view.get_attr_by_channel("x")[0]
    y_attr = view.get_attr_by_channel("y")[0]
    has_color = False
    groupby_attr = ""
    measure_attr = ""
    if (x_attr.aggregation is None or y_attr.aggregation is None):
        return
    # whichever axis carries an aggregation becomes the measure; the other is the groupby
    if (y_attr.aggregation != ""):
        groupby_attr = x_attr
        measure_attr = y_attr
        agg_func = y_attr.aggregation
    if (x_attr.aggregation != ""):
        groupby_attr = y_attr
        measure_attr = x_attr
        agg_func = x_attr.aggregation
    # checks if color is specified in the Vis
    if len(view.get_attr_by_channel("color")) == 1:
        color_attr = view.get_attr_by_channel("color")[0]
        color_attr_vals = view.data.unique_values[color_attr.attribute]
        color_cardinality = len(color_attr_vals)
        # NOTE: might want to have a check somewhere to not use categorical
        # variables with greater than some number of categories as a Color
        # variable
        has_color = True
    else:
        color_cardinality = 1
    all_attr_vals = view.data.unique_values[groupby_attr.attribute]
    if (measure_attr != ""):
        if (measure_attr.attribute == "Record"):
            view.data = view.data.reset_index()
            # if color is specified, need to group by groupby_attr and color_attr
            if has_color:
                view.data = view.data.groupby(
                    [groupby_attr.attribute,
                     color_attr.attribute]).count().reset_index()
                view.data = view.data.rename(columns={"index": "Record"})
                view.data = view.data[[
                    groupby_attr.attribute, color_attr.attribute, "Record"
                ]]
            else:
                view.data = view.data.groupby(
                    groupby_attr.attribute).count().reset_index()
                view.data = view.data.rename(columns={"index": "Record"})
                view.data = view.data[[groupby_attr.attribute, "Record"]]
        else:
            # if color is specified, need to group by groupby_attr and color_attr
            if has_color:
                groupby_result = view.data.groupby(
                    [groupby_attr.attribute, color_attr.attribute])
            else:
                groupby_result = view.data.groupby(groupby_attr.attribute)
            view.data = groupby_result.agg(agg_func).reset_index()
        result_vals = list(view.data[groupby_attr.attribute])
        # For filtered aggregation (or any color-encoded aggregation) that has
        # missing groupby/color value combinations, pad those combinations with
        # aggregated value 0, since no datapoints exist for them. The padding is
        # done with a single right-merge against the full value grid instead of
        # a per-missing-value append loop.
        if (len(result_vals) != len(all_attr_vals) * color_cardinality
                and (isFiltered or has_color)):
            columns = view.data.columns
            if has_color:
                # full cartesian grid of (groupby value, color value) pairs
                df = pd.DataFrame({
                    columns[0]:
                    all_attr_vals * color_cardinality,
                    columns[1]:
                    pd.Series(color_attr_vals).repeat(len(all_attr_vals))
                })
                view.data = view.data.merge(df,
                                            on=[columns[0], columns[1]],
                                            how='right',
                                            suffixes=['', '_right'])
                for col in columns[2:]:
                    view.data[col] = view.data[col].fillna(0)
                assert len(
                    list(view.data[groupby_attr.attribute])
                ) == len(all_attr_vals) * len(
                    color_attr_vals
                ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
            else:
                df = pd.DataFrame({columns[0]: all_attr_vals})
                view.data = view.data.merge(df,
                                            on=columns[0],
                                            how='right',
                                            suffixes=['', '_right'])
                for col in columns[1:]:
                    view.data[col] = view.data[col].fillna(0)
                assert len(list(view.data[groupby_attr.attribute])) == len(
                    all_attr_vals
                ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
        view.data = view.data.sort_values(by=groupby_attr.attribute,
                                          ascending=True)
        # drop=True avoids materializing (then deleting) the old index as a
        # column, and is safe even if a column named "index" already exists
        view.data = view.data.reset_index(drop=True)