def execute_aggregate(view: Vis, ldf: LuxDataFrame): import pandas as pd x_attr = view.get_attr_by_channel("x")[0] y_attr = view.get_attr_by_channel("y")[0] groupby_attr = "" measure_attr = "" if (y_attr.aggregation != ""): groupby_attr = x_attr measure_attr = y_attr agg_func = y_attr.aggregation if (x_attr.aggregation != ""): groupby_attr = y_attr measure_attr = x_attr agg_func = x_attr.aggregation if (measure_attr != ""): #barchart case, need count data for each group if (measure_attr.attribute == "Record"): where_clause, filterVars = SQLExecutor.execute_filter(view) count_query = "SELECT {}, COUNT({}) FROM {} {} GROUP BY {}".format( groupby_attr.attribute, groupby_attr.attribute, ldf.table_name, where_clause, groupby_attr.attribute) view.data = pd.read_sql(count_query, ldf.SQLconnection) view.data = view.data.rename(columns={"count": "Record"}) view.data = utils.pandas_to_lux(view.data) else: where_clause, filterVars = SQLExecutor.execute_filter(view) if agg_func == "mean": mean_query = "SELECT {}, AVG({}) as {} FROM {} {} GROUP BY {}".format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, ldf.table_name, where_clause, groupby_attr.attribute) view.data = pd.read_sql(mean_query, ldf.SQLconnection) view.data = utils.pandas_to_lux(view.data) if agg_func == "sum": mean_query = "SELECT {}, SUM({}) as {} FROM {} {} GROUP BY {}".format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, ldf.table_name, where_clause, groupby_attr.attribute) view.data = pd.read_sql(mean_query, ldf.SQLconnection) view.data = utils.pandas_to_lux(view.data) if agg_func == "max": mean_query = "SELECT {}, MAX({}) as {} FROM {} {} GROUP BY {}".format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, ldf.table_name, where_clause, groupby_attr.attribute) view.data = pd.read_sql(mean_query, ldf.SQLconnection) view.data = utils.pandas_to_lux(view.data) #pad empty categories with 0 counts after filter is applied all_attr_vals = ldf.unique_values[groupby_attr.attribute] result_vals = list(view.data[groupby_attr.attribute]) if (len(result_vals) != len(all_attr_vals)): # For filtered aggregation that have missing groupby-attribute values, set these aggregated value as 0, since no datapoints for vals in all_attr_vals: if (vals not in result_vals): view.data.loc[len(view.data)] = [ vals ] + [0] * (len(view.data.columns) - 1)
def execute_aggregate(view: View, ldf: LuxDataFrame): import pandas as pd x_attr = view.get_attr_by_channel("x")[0] y_attr = view.get_attr_by_channel("y")[0] groupby_attr = "" measure_attr = "" if (y_attr.aggregation != ""): groupby_attr = x_attr measure_attr = y_attr agg_func = y_attr.aggregation if (x_attr.aggregation != ""): groupby_attr = y_attr measure_attr = x_attr agg_func = x_attr.aggregation if (measure_attr != ""): #barchart case, need count data for each group if (measure_attr.attribute == "Record"): where_clause, filterVars = SQLExecutor.execute_filter(view) count_query = "SELECT {}, COUNT({}) FROM {} {} GROUP BY {}".format( groupby_attr.attribute, groupby_attr.attribute, ldf.table_name, where_clause, groupby_attr.attribute) view.data = pd.read_sql(count_query, ldf.SQLconnection) view.data = view.data.rename(columns={"count": "Record"}) view.data = utils.pandas_to_lux(view.data) else: where_clause, filterVars = SQLExecutor.execute_filter(view) if agg_func == "mean": mean_query = "SELECT {}, AVG({}) as {} FROM {} {} GROUP BY {}".format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, ldf.table_name, where_clause, groupby_attr.attribute) view.data = pd.read_sql(mean_query, ldf.SQLconnection) view.data = utils.pandas_to_lux(view.data) if agg_func == "sum": mean_query = "SELECT {}, SUM({}) as {} FROM {} {} GROUP BY {}".format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, ldf.table_name, where_clause, groupby_attr.attribute) view.data = pd.read_sql(mean_query, ldf.SQLconnection) view.data = utils.pandas_to_lux(view.data) if agg_func == "max": mean_query = "SELECT {}, MAX({}) as {} FROM {} {} GROUP BY {}".format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, ldf.table_name, where_clause, groupby_attr.attribute) view.data = pd.read_sql(mean_query, ldf.SQLconnection) view.data = utils.pandas_to_lux(view.data)
def execute_binning(vis: Vis, ldf: LuxDataFrame): import numpy as np import pandas as pd bin_attribute = list(filter(lambda x: x.bin_size != 0, vis._inferred_intent))[0] if not math.isnan(vis.data.min_max[bin_attribute.attribute][0]) and math.isnan( vis.data.min_max[bin_attribute.attribute][1] ): num_bins = bin_attribute.bin_size attr_min = min(ldf.unique_values[bin_attribute.attribute]) attr_max = max(ldf.unique_values[bin_attribute.attribute]) attr_type = type(ldf.unique_values[bin_attribute.attribute][0]) # need to calculate the bin edges before querying for the relevant data bin_width = (attr_max - attr_min) / num_bins upper_edges = [] for e in range(1, num_bins): curr_edge = attr_min + e * bin_width if attr_type == int: upper_edges.append(str(math.ceil(curr_edge))) else: upper_edges.append(str(curr_edge)) upper_edges = ",".join(upper_edges) vis_filter, filter_vars = SQLExecutor.execute_filter(vis) bin_count_query = f"SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket({bin_attribute.attribute}, '{{{upper_edges}}}') FROM {ldf.table_name}) as Buckets GROUP BY width_bucket ORDER BY width_bucket" bin_count_data = pd.read_sql(bin_count_query, ldf.SQLconnection) # counts,binEdges = np.histogram(ldf[bin_attribute.attribute],bins=bin_attribute.bin_size) # binEdges of size N+1, so need to compute binCenter as the bin location upper_edges = [float(i) for i in upper_edges.split(",")] if attr_type == int: bin_centers = np.array([math.ceil((attr_min + attr_min + bin_width) / 2)]) else: bin_centers = np.array([(attr_min + attr_min + bin_width) / 2]) bin_centers = np.append( bin_centers, np.mean(np.vstack([upper_edges[0:-1], upper_edges[1:]]), axis=0), ) if attr_type == int: bin_centers = np.append( bin_centers, math.ceil((upper_edges[len(upper_edges) - 1] + attr_max) / 2), ) else: bin_centers = np.append(bin_centers, (upper_edges[len(upper_edges) - 1] + attr_max) / 2) if len(bin_centers) > len(bin_count_data): bucket_lables = bin_count_data["width_bucket"].unique() for i in range(0, len(bin_centers)): if i not in bucket_lables: bin_count_data = bin_count_data.append( pd.DataFrame([[i, 0]], columns=bin_count_data.columns) ) vis._vis_data = pd.DataFrame( np.array([bin_centers, list(bin_count_data["count"])]).T, columns=[bin_attribute.attribute, "Number of Records"], ) vis._vis_data = utils.pandas_to_lux(vis.data)
def execute_scatter(view: Vis, tbl: LuxSQLTable): """ Given a scatterplot vis and a Lux Dataframe, fetch the data required to render the vis. 1) Generate WHERE clause for the SQL query 2) Check number of datapoints to be included in the query 3) If the number of datapoints exceeds 10000, perform a random sample from the original data 4) Query datapoints needed for the scatterplot visualization 5) return a DataFrame with relevant results Parameters ---------- vislist: list[lux.Vis] vis list that contains lux.Vis objects for visualization. tbl : lux.core.frame LuxSQLTable with specified intent. Returns ------- None """ attributes = set([]) for clause in view._inferred_intent: if clause.attribute: if clause.attribute != "Record": attributes.add(clause.attribute) where_clause, filterVars = SQLExecutor.execute_filter(view) length_query = pandas.read_sql( "SELECT COUNT(1) as length FROM {} {}".format(tbl.table_name, where_clause), lux.config.SQLconnection, ) def add_quotes(var_name): return '"' + var_name + '"' required_variables = attributes | set(filterVars) required_variables = map(add_quotes, required_variables) required_variables = ",".join(required_variables) row_count = list( pandas.read_sql( f"SELECT COUNT(*) FROM {tbl.table_name} {where_clause}", lux.config.SQLconnection, )["count"] )[0] if row_count > lux.config.sampling_cap: query = f"SELECT {required_variables} FROM {tbl.table_name} {where_clause} ORDER BY random() LIMIT 10000" else: query = "SELECT {} FROM {} {}".format(required_variables, tbl.table_name, where_clause) data = pandas.read_sql(query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(data) # view._vis_data.length = list(length_query["length"])[0] tbl._message.add_unique( f"Large scatterplots detected: Lux is automatically binning scatterplots to heatmaps.", priority=98, )
def execute(vislist: VisList, ldf: LuxDataFrame): import pandas as pd """ Given a VisList, fetch the data required to render the vis 1) Apply filters 2) Retreive relevant attribute 3) return a DataFrame with relevant results """ for vis in vislist: # Select relevant data based on attribute information attributes = set([]) for clause in vis._inferred_intent: if clause.attribute: if clause.attribute == "Record": attributes.add(clause.attribute) # else: attributes.add(clause.attribute) if vis.mark not in ["bar", "line", "histogram"]: where_clause, filterVars = SQLExecutor.execute_filter(vis) required_variables = attributes | set(filterVars) required_variables = ",".join(required_variables) row_count = list( pd.read_sql( "SELECT COUNT(*) FROM {} {}".format( ldf.table_name, where_clause), ldf.SQLconnection, )["count"])[0] if row_count > 10000: query = "SELECT {} FROM {} {} ORDER BY random() LIMIT 10000".format( required_variables, ldf.table_name, where_clause) else: query = "SELECT {} FROM {} {}".format( required_variables, ldf.table_name, where_clause) data = pd.read_sql(query, ldf.SQLconnection) vis._vis_data = utils.pandas_to_lux(data) if vis.mark == "bar" or vis.mark == "line": SQLExecutor.execute_aggregate(vis, ldf) elif vis.mark == "histogram": SQLExecutor.execute_binning(vis, ldf)
def execute(view_collection: ViewCollection, ldf: LuxDataFrame): import pandas as pd ''' Given a ViewCollection, fetch the data required to render the view 1) Apply filters 2) Retreive relevant attribute 3) return a DataFrame with relevant results ''' for view in view_collection: print(view, utils.get_filter_specs(view.spec_lst)) # Select relevant data based on attribute information attributes = set([]) for spec in view.spec_lst: if (spec.attribute): if (spec.attribute == "Record"): attributes.add(spec.attribute) #else: attributes.add(spec.attribute) if view.mark not in ["bar", "line", "histogram"]: where_clause, filterVars = SQLExecutor.execute_filter(view) required_variables = attributes | set(filterVars) required_variables = ",".join(required_variables) row_count = list( pd.read_sql( "SELECT COUNT(*) FROM {} {}".format( ldf.table_name, where_clause), ldf.SQLconnection)['count'])[0] if row_count > 10000: query = "SELECT {} FROM {} {} ORDER BY random() LIMIT 10000".format( required_variables, ldf.table_name, where_clause) else: query = "SELECT {} FROM {} {}".format( required_variables, ldf.table_name, where_clause) data = pd.read_sql(query, ldf.SQLconnection) view.data = utils.pandas_to_lux(data) if (view.mark == "bar" or view.mark == "line"): SQLExecutor.execute_aggregate(view, ldf) elif (view.mark == "histogram"): SQLExecutor.execute_binning(view, ldf)
def execute_2D_binning(view: Vis, tbl: LuxSQLTable): import numpy as np x_attribute = list(filter(lambda x: x.channel == "x", view._inferred_intent))[0] y_attribute = list(filter(lambda x: x.channel == "y", view._inferred_intent))[0] num_bins = lux.config.heatmap_bin_size x_attr_min = tbl._min_max[x_attribute.attribute][0] x_attr_max = tbl._min_max[x_attribute.attribute][1] x_attr_type = type(tbl.unique_values[x_attribute.attribute][0]) y_attr_min = tbl._min_max[y_attribute.attribute][0] y_attr_max = tbl._min_max[y_attribute.attribute][1] y_attr_type = type(tbl.unique_values[y_attribute.attribute][0]) # get filters if available where_clause, filterVars = SQLExecutor.execute_filter(view) # need to calculate the bin edges before querying for the relevant data x_bin_width = (x_attr_max - x_attr_min) / num_bins y_bin_width = (y_attr_max - y_attr_min) / num_bins x_upper_edges = [] y_upper_edges = [] for e in range(0, num_bins): x_curr_edge = x_attr_min + e * x_bin_width y_curr_edge = y_attr_min + e * y_bin_width # get upper edges for x attribute bins if x_attr_type == int: x_upper_edges.append(math.ceil(x_curr_edge)) else: x_upper_edges.append(x_curr_edge) # get upper edges for y attribute bins if y_attr_type == int: y_upper_edges.append(str(math.ceil(y_curr_edge))) else: y_upper_edges.append(str(y_curr_edge)) x_upper_edges_string = [str(int) for int in x_upper_edges] x_upper_edges_string = ",".join(x_upper_edges_string) y_upper_edges_string = ",".join(y_upper_edges) bin_count_query = "SELECT width_bucket1, width_bucket2, count(*) FROM (SELECT width_bucket(CAST (\"{}\" AS FLOAT), '{}') as width_bucket1, width_bucket(CAST (\"{}\" AS FLOAT), '{}') as width_bucket2 FROM {} {}) as foo GROUP BY width_bucket1, width_bucket2".format( x_attribute.attribute, "{" + x_upper_edges_string + "}", y_attribute.attribute, "{" + y_upper_edges_string + "}", tbl.table_name, where_clause, ) # data = pandas.read_sql(bin_count_query, lux.config.SQLconnection) data = pandas.read_sql(bin_count_query, lux.config.SQLconnection) # data = data[data["width_bucket1"] != num_bins - 1] # data = data[data["width_bucket2"] != num_bins - 1] if len(data) > 0: data["xBinStart"] = data.apply( lambda row: float(x_upper_edges[int(row["width_bucket1"]) - 1]) - x_bin_width, axis=1 ) data["xBinEnd"] = data.apply( lambda row: float(x_upper_edges[int(row["width_bucket1"]) - 1]), axis=1 ) data["yBinStart"] = data.apply( lambda row: float(y_upper_edges[int(row["width_bucket2"]) - 1]) - y_bin_width, axis=1 ) data["yBinEnd"] = data.apply( lambda row: float(y_upper_edges[int(row["width_bucket2"]) - 1]), axis=1 ) view._vis_data = utils.pandas_to_lux(data)
def execute_binning(view: Vis, tbl: LuxSQLTable): """ Binning of data points for generating histograms Parameters ---------- vis: lux.Vis lux.Vis object that represents a visualization tbl : lux.core.frame LuxSQLTable with specified intent. Returns ------- None """ import numpy as np bin_attribute = list(filter(lambda x: x.bin_size != 0, view._inferred_intent))[0] num_bins = bin_attribute.bin_size attr_min = tbl._min_max[bin_attribute.attribute][0] attr_max = tbl._min_max[bin_attribute.attribute][1] attr_type = type(tbl.unique_values[bin_attribute.attribute][0]) # get filters if available where_clause, filterVars = SQLExecutor.execute_filter(view) length_query = pandas.read_sql( "SELECT COUNT(1) as length FROM {} {}".format(tbl.table_name, where_clause), lux.config.SQLconnection, ) # need to calculate the bin edges before querying for the relevant data bin_width = (attr_max - attr_min) / num_bins upper_edges = [] for e in range(1, num_bins): curr_edge = attr_min + e * bin_width if attr_type == int: upper_edges.append(str(math.ceil(curr_edge))) else: upper_edges.append(str(curr_edge)) upper_edges = ",".join(upper_edges) view_filter, filter_vars = SQLExecutor.execute_filter(view) bin_count_query = "SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket(CAST (\"{}\" AS FLOAT), '{}') FROM {} {}) as Buckets GROUP BY width_bucket ORDER BY width_bucket".format( bin_attribute.attribute, "{" + upper_edges + "}", tbl.table_name, where_clause, ) bin_count_data = pandas.read_sql(bin_count_query, lux.config.SQLconnection) if not bin_count_data["width_bucket"].isnull().values.any(): # np.histogram breaks if data contain NaN # counts,binEdges = np.histogram(tbl[bin_attribute.attribute],bins=bin_attribute.bin_size) # binEdges of size N+1, so need to compute binCenter as the bin location upper_edges = [float(i) for i in upper_edges.split(",")] if attr_type == int: bin_centers = np.array([math.ceil((attr_min + attr_min + bin_width) / 2)]) else: bin_centers = np.array([(attr_min + attr_min + bin_width) / 2]) bin_centers = np.append( bin_centers, np.mean(np.vstack([upper_edges[0:-1], upper_edges[1:]]), axis=0), ) if attr_type == int: bin_centers = np.append( bin_centers, math.ceil((upper_edges[len(upper_edges) - 1] + attr_max) / 2), ) else: bin_centers = np.append(bin_centers, (upper_edges[len(upper_edges) - 1] + attr_max) / 2) if len(bin_centers) > len(bin_count_data): bucket_lables = bin_count_data["width_bucket"].unique() for i in range(0, len(bin_centers)): if i not in bucket_lables: bin_count_data = bin_count_data.append( pandas.DataFrame([[i, 0]], columns=bin_count_data.columns) ) view._vis_data = pandas.DataFrame( np.array([bin_centers, list(bin_count_data["count"])]).T, columns=[bin_attribute.attribute, "Number of Records"], ) view._vis_data = utils.pandas_to_lux(view.data)
def execute_aggregate(view: Vis, tbl: LuxSQLTable, isFiltered=True): """ Aggregate data points on an axis for bar or line charts Parameters ---------- vis: lux.Vis lux.Vis object that represents a visualization tbl : lux.core.frame LuxSQLTable with specified intent. isFiltered: boolean boolean that represents whether a vis has had a filter applied to its data Returns ------- None """ x_attr = view.get_attr_by_channel("x")[0] y_attr = view.get_attr_by_channel("y")[0] has_color = False groupby_attr = "" measure_attr = "" if x_attr.aggregation is None or y_attr.aggregation is None: return if y_attr.aggregation != "": groupby_attr = x_attr measure_attr = y_attr agg_func = y_attr.aggregation if x_attr.aggregation != "": groupby_attr = y_attr measure_attr = x_attr agg_func = x_attr.aggregation if groupby_attr.attribute in tbl.unique_values.keys(): attr_unique_vals = tbl.unique_values[groupby_attr.attribute] # checks if color is specified in the Vis if len(view.get_attr_by_channel("color")) == 1: color_attr = view.get_attr_by_channel("color")[0] color_attr_vals = tbl.unique_values[color_attr.attribute] color_cardinality = len(color_attr_vals) # NOTE: might want to have a check somewhere to not use categorical variables with greater than some number of categories as a Color variable---------------- has_color = True else: color_cardinality = 1 if measure_attr != "": # barchart case, need count data for each group if measure_attr.attribute == "Record": where_clause, filterVars = SQLExecutor.execute_filter(view) length_query = pandas.read_sql( "SELECT COUNT(*) as length FROM {} {}".format(tbl.table_name, where_clause), lux.config.SQLconnection, ) # generates query for colored barchart case if has_color: count_query = 'SELECT "{}", "{}", COUNT("{}") FROM {} {} GROUP BY "{}", "{}"'.format( groupby_attr.attribute, color_attr.attribute, groupby_attr.attribute, tbl.table_name, where_clause, groupby_attr.attribute, color_attr.attribute, ) view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection) view._vis_data = view._vis_data.rename(columns={"count": "Record"}) view._vis_data = utils.pandas_to_lux(view._vis_data) # generates query for normal barchart case else: count_query = 'SELECT "{}", COUNT("{}") FROM {} {} GROUP BY "{}"'.format( groupby_attr.attribute, groupby_attr.attribute, tbl.table_name, where_clause, groupby_attr.attribute, ) view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection) view._vis_data = view._vis_data.rename(columns={"count": "Record"}) view._vis_data = utils.pandas_to_lux(view._vis_data) # view._vis_data.length = list(length_query["length"])[0] # aggregate barchart case, need aggregate data (mean, sum, max) for each group else: where_clause, filterVars = SQLExecutor.execute_filter(view) length_query = pandas.read_sql( "SELECT COUNT(*) as length FROM {} {}".format(tbl.table_name, where_clause), lux.config.SQLconnection, ) # generates query for colored barchart case if has_color: if agg_func == "mean": agg_query = ( 'SELECT "{}", "{}", AVG("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format( groupby_attr.attribute, color_attr.attribute, measure_attr.attribute, measure_attr.attribute, tbl.table_name, where_clause, groupby_attr.attribute, color_attr.attribute, ) ) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "sum": agg_query = ( 'SELECT "{}", "{}", SUM("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format( groupby_attr.attribute, color_attr.attribute, measure_attr.attribute, measure_attr.attribute, tbl.table_name, where_clause, groupby_attr.attribute, color_attr.attribute, ) ) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "max": agg_query = ( 'SELECT "{}", "{}", MAX("{}") as "{}" FROM {} {} GROUP BY "{}", "{}"'.format( groupby_attr.attribute, color_attr.attribute, measure_attr.attribute, measure_attr.attribute, tbl.table_name, where_clause, groupby_attr.attribute, color_attr.attribute, ) ) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) # generates query for normal barchart case else: if agg_func == "mean": agg_query = 'SELECT "{}", AVG("{}") as "{}" FROM {} {} GROUP BY "{}"'.format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, tbl.table_name, where_clause, groupby_attr.attribute, ) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "sum": agg_query = 'SELECT "{}", SUM("{}") as "{}" FROM {} {} GROUP BY "{}"'.format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, tbl.table_name, where_clause, groupby_attr.attribute, ) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) if agg_func == "max": agg_query = 'SELECT "{}", MAX("{}") as "{}" FROM {} {} GROUP BY "{}"'.format( groupby_attr.attribute, measure_attr.attribute, measure_attr.attribute, tbl.table_name, where_clause, groupby_attr.attribute, ) view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection) view._vis_data = utils.pandas_to_lux(view._vis_data) result_vals = list(view._vis_data[groupby_attr.attribute]) # create existing group by attribute combinations if color is specified # this is needed to check what combinations of group_by_attr and color_attr values have a non-zero number of elements in them if has_color: res_color_combi_vals = [] result_color_vals = list(view._vis_data[color_attr.attribute]) for i in range(0, len(result_vals)): res_color_combi_vals.append([result_vals[i], result_color_vals[i]]) # For filtered aggregation that have missing groupby-attribute values, set these aggregated value as 0, since no datapoints if isFiltered or has_color and attr_unique_vals: N_unique_vals = len(attr_unique_vals) if len(result_vals) != N_unique_vals * color_cardinality: columns = view._vis_data.columns if has_color: df = pandas.DataFrame( { columns[0]: attr_unique_vals * color_cardinality, columns[1]: pandas.Series(color_attr_vals).repeat(N_unique_vals), } ) view._vis_data = view._vis_data.merge( df, on=[columns[0], columns[1]], how="right", suffixes=["", "_right"], ) for col in columns[2:]: view._vis_data[col] = view._vis_data[col].fillna(0) # Triggers __setitem__ assert len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals * len( color_attr_vals ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`." view._vis_data = view._vis_data.iloc[ :, :3 ] # Keep only the three relevant columns not the *_right columns resulting from merge else: df = pandas.DataFrame({columns[0]: attr_unique_vals}) view._vis_data = view._vis_data.merge( df, on=columns[0], how="right", suffixes=["", "_right"] ) for col in columns[1:]: view._vis_data[col] = view._vis_data[col].fillna(0) assert ( len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`." view._vis_data = view._vis_data.sort_values(by=groupby_attr.attribute, ascending=True) view._vis_data = view._vis_data.reset_index() view._vis_data = view._vis_data.drop(columns="index")
def execute_2D_binning(view: Vis, tbl: LuxSQLTable): import numpy as np x_attribute = list(filter(lambda x: x.channel == "x", view._inferred_intent))[0] y_attribute = list(filter(lambda x: x.channel == "y", view._inferred_intent))[0] num_bins = lux.config.heatmap_bin_size x_attr_min = tbl._min_max[x_attribute.attribute][0] x_attr_max = tbl._min_max[x_attribute.attribute][1] x_attr_type = type(tbl.unique_values[x_attribute.attribute][0]) y_attr_min = tbl._min_max[y_attribute.attribute][0] y_attr_max = tbl._min_max[y_attribute.attribute][1] y_attr_type = type(tbl.unique_values[y_attribute.attribute][0]) # get filters if available where_clause, filterVars = SQLExecutor.execute_filter(view) # need to calculate the bin edges before querying for the relevant data x_bin_width = (x_attr_max - x_attr_min) / num_bins y_bin_width = (y_attr_max - y_attr_min) / num_bins x_upper_edges = [] y_upper_edges = [] for e in range(0, num_bins): x_curr_edge = x_attr_min + e * x_bin_width y_curr_edge = y_attr_min + e * y_bin_width # get upper edges for x attribute bins if x_attr_type == int: x_upper_edges.append(math.ceil(x_curr_edge)) else: x_upper_edges.append(x_curr_edge) # get upper edges for y attribute bins if y_attr_type == int: y_upper_edges.append(str(math.ceil(y_curr_edge))) else: y_upper_edges.append(str(y_curr_edge)) x_upper_edges_string = [str(int) for int in x_upper_edges] x_upper_edges_string = ",".join(x_upper_edges_string) y_upper_edges_string = ",".join(y_upper_edges) if "cases" in lux.config.query_templates['histogram_counts']: x_bucket_edges = [x_attr_min] y_bucket_edges = [y_attr_min] for e in range(1, num_bins): x_curr_edge = x_attr_min + e * x_bin_width x_bucket_edges.append(str(x_curr_edge)) y_curr_edge = y_attr_min + e * y_bin_width y_bucket_edges.append(str(y_curr_edge)) x_bucket_edges.append(x_attr_max) y_bucket_edges.append(y_attr_max) when_line = "WHEN {column} BETWEEN {lower_edge} AND {upper_edge} THEN {label}" x_when_lines = "CASE " y_when_lines = "CASE " for i in range(1, len(x_bucket_edges)): x_when_lines = x_when_lines + when_line.format(column = x_attribute.attribute, lower_edge = x_bucket_edges[i-1], upper_edge = x_bucket_edges[i], label = str(i-1)) + " " y_when_lines = y_when_lines + when_line.format(column = y_attribute.attribute, lower_edge = y_bucket_edges[i-1], upper_edge = y_bucket_edges[i], label = str(i-1)) + " " x_when_lines = x_when_lines + "end" y_when_lines = y_when_lines + "end" #hist_query = "select width_bucket, count(width_bucket) as count from (select ({bucket_cases}) as width_bucket from {table_name} {where_clause}) as buckets group by width_bucket order by width_bucket" bin_count_query = lux.config.query_templates['heatmap_counts'].format(bucket_cases1 = x_when_lines, bucket_cases2 = y_when_lines, table_name = tbl.table_name, where_clause = where_clause) else: bin_count_query = lux.config.query_templates['heatmap_counts'].format(x_attribute = x_attribute.attribute,x_upper_edges_string = "{" + x_upper_edges_string + "}",y_attribute = y_attribute.attribute,y_upper_edges_string = "{" + y_upper_edges_string + "}",table_name = tbl.table_name,where_clause = where_clause,) # data = pandas.read_sql(bin_count_query, lux.config.SQLconnection) data = pandas.read_sql(bin_count_query, lux.config.SQLconnection) assert((len(data.columns) == 3) & (set(['width_bucket1', 'width_bucket2', 'count']).issubset(data.columns))) # data = data[data["width_bucket1"] != num_bins - 1] # data = data[data["width_bucket2"] != num_bins - 1] if len(data) > 0: data["xBinStart"] = data.apply(lambda row: float(x_upper_edges[int(row["width_bucket1"]) - 1]) - x_bin_width, axis=1) data["xBinEnd"] = data.apply(lambda row: float(x_upper_edges[int(row["width_bucket1"]) - 1]), axis=1) data["yBinStart"] = data.apply(lambda row: float(y_upper_edges[int(row["width_bucket2"]) - 1]) - y_bin_width, axis=1) data["yBinEnd"] = data.apply(lambda row: float(y_upper_edges[int(row["width_bucket2"]) - 1]), axis=1) view._vis_data = utils.pandas_to_lux(data)
def execute_binning(view: Vis, tbl: LuxSQLTable): """ Binning of data points for generating histograms Parameters ---------- vis: lux.Vis lux.Vis object that represents a visualization tbl : lux.core.frame LuxSQLTable with specified intent. Returns ------- None """ import numpy as np bin_attribute = list(filter(lambda x: x.bin_size != 0, view._inferred_intent))[0] num_bins = bin_attribute.bin_size attr_min = tbl._min_max[bin_attribute.attribute][0] attr_max = tbl._min_max[bin_attribute.attribute][1] attr_type = type(tbl.unique_values[bin_attribute.attribute][0]) # get filters if available where_clause, filterVars = SQLExecutor.execute_filter(view) length_query = pandas.read_sql(lux.config.query_templates['length_query'].format(table_name = tbl.table_name, where_clause = where_clause),lux.config.SQLconnection,) bin_width = (attr_max - attr_min) / num_bins upper_edges = [] for e in range(1, num_bins): curr_edge = attr_min + e * bin_width if attr_type == int: upper_edges.append(str(math.ceil(curr_edge))) else: upper_edges.append(str(curr_edge)) upper_edges = ",".join(upper_edges) view_filter, filter_vars = SQLExecutor.execute_filter(view) #handling for non postgres case if "cases" in lux.config.query_templates['histogram_counts']: bucket_edges = [attr_min] for e in range(1, num_bins): curr_edge = attr_min + e * bin_width bucket_edges.append(str(curr_edge)) bucket_edges.append(attr_max) when_line = "WHEN {column} BETWEEN {lower_edge} AND {upper_edge} THEN {label}" when_lines = "CASE " for i in range(1, len(bucket_edges)): when_lines = when_lines + when_line.format(column = bin_attribute.attribute, lower_edge = bucket_edges[i-1], upper_edge = bucket_edges[i], label = str(i-1)) + " " when_lines = when_lines + "end" #hist_query = "select width_bucket, count(width_bucket) as count from (select ({bucket_cases}) as width_bucket from {table_name} {where_clause}) as buckets group by width_bucket order by width_bucket" bin_count_query = lux.config.query_templates['histogram_counts'].format(bucket_cases = when_lines, table_name = tbl.table_name, where_clause = where_clause) # need to calculate the bin edges before querying for the relevant data else: bin_count_query = lux.config.query_templates['histogram_counts'].format(bin_attribute = bin_attribute.attribute,upper_edges = "{" + upper_edges + "}",table_name = tbl.table_name,where_clause = where_clause,) bin_count_data = pandas.read_sql(bin_count_query, lux.config.SQLconnection) assert((len(bin_count_data.columns) ==2) & (set(['width_bucket', 'count']).issubset(bin_count_data.columns))) if not bin_count_data["width_bucket"].isnull().values.any(): # np.histogram breaks if data contain NaN # counts,binEdges = np.histogram(tbl[bin_attribute.attribute],bins=bin_attribute.bin_size) # binEdges of size N+1, so need to compute binCenter as the bin location upper_edges = [float(i) for i in upper_edges.split(",")] if attr_type == int: bin_centers = np.array([math.ceil((attr_min + attr_min + bin_width) / 2)]) else: bin_centers = np.array([(attr_min + attr_min + bin_width) / 2]) bin_centers = np.append(bin_centers,np.mean(np.vstack([upper_edges[0:-1], upper_edges[1:]]), axis=0),) if attr_type == int: bin_centers = np.append(bin_centers,math.ceil((upper_edges[len(upper_edges) - 1] + attr_max) / 2),) else: bin_centers = np.append(bin_centers, (upper_edges[len(upper_edges) - 1] + attr_max) / 2) if len(bin_centers) > len(bin_count_data): bucket_lables = bin_count_data["width_bucket"].unique() for i in range(0, len(bin_centers)): if i not in bucket_lables: bin_count_data = bin_count_data.append(pandas.DataFrame([[i, 0]], columns=bin_count_data.columns)) view._vis_data = pandas.DataFrame(np.array([bin_centers, list(bin_count_data["count"])]).T,columns=[bin_attribute.attribute, "Number of Records"],) view._vis_data = utils.pandas_to_lux(view.data)