Esempio n. 1
0
def test_inequalityfilter():
    """Inequality filters must reduce vis.data to the expected number of rows."""
    cars = pd.read_csv("lux/data/car.csv")

    # Strict lower bound; this Vis is created without a source frame.
    lower_bound_spec = [
        lux.Clause(attribute="Horsepower", filter_op=">", value=50),
        lux.Clause(attribute="MilesPerGal"),
    ]
    vis = Vis(lower_bound_spec)
    vis._vis_data = cars
    PandasExecutor.execute_filter(vis)
    assert len(cars) > len(vis.data)
    assert len(vis.data) == 386

    # Inclusive upper bound; compare against the same mask applied by hand.
    upper_bound_spec = [
        lux.Clause(attribute="Horsepower", filter_op="<=", value=100),
        lux.Clause(attribute="MilesPerGal"),
    ]
    vis = Vis(upper_bound_spec, cars)
    vis._vis_data = cars
    PandasExecutor.execute_filter(vis)
    expected_rows = len(cars[cars["Horsepower"] <= 100])
    assert len(vis.data) == expected_rows == 242

    # Test end-to-end: after full execution the data holds one row per bin.
    PandasExecutor.execute([vis], cars)
    binned_clauses = [clause for clause in vis._inferred_intent if clause.bin_size != 0]
    Nbins = binned_clauses[0].bin_size
    assert len(vis.data) == Nbins
Esempio n. 2
0
def test_inequalityfilter():
    """SQLExecutor must turn inequality clauses into the expected WHERE clause."""
    connection = psycopg2.connect(
        "host=localhost dbname=postgres user=postgres password=lux")
    sql_df = lux.LuxSQLTable()
    lux.config.set_SQL_connection(connection)
    sql_df.set_SQL_table("car")

    # Strict lower bound; this Vis is created without a source table.
    lower_bound_spec = [
        lux.Clause(attribute="Horsepower", filter_op=">", value=50),
        lux.Clause(attribute="MilesPerGal"),
    ]
    vis = Vis(lower_bound_spec)
    vis._vis_data = sql_df
    where_clause, filter_vars = SQLExecutor.execute_filter(vis)
    assert where_clause == 'WHERE "Horsepower" > \'50\' AND "MilesPerGal" IS NOT NULL'
    assert filter_vars == ["Horsepower"]

    # Inclusive upper bound, this time with the table passed to the Vis.
    upper_bound_spec = [
        lux.Clause(attribute="Horsepower", filter_op="<=", value=100),
        lux.Clause(attribute="MilesPerGal"),
    ]
    vis = Vis(upper_bound_spec, sql_df)
    vis._vis_data = sql_df
    where_clause, filter_vars = SQLExecutor.execute_filter(vis)
    assert where_clause == 'WHERE "Horsepower" <= \'100\' AND "MilesPerGal" IS NOT NULL'
    assert filter_vars == ["Horsepower"]
Esempio n. 3
0
    def execute_binning(vis: Vis, ldf: LuxDataFrame):
        """
        Bin a quantitative attribute for a histogram by counting rows per bucket in SQL.

        The bin edges are computed locally from the attribute's range, the table is
        bucketed with the database's ``width_bucket`` function, and the resulting
        bin centers and counts are stored on ``vis._vis_data``.

        Parameters
        ----------
        vis : lux.Vis
            Vis whose inferred intent contains a clause with a non-zero ``bin_size``.
        ldf : LuxDataFrame
            Frame providing ``unique_values``, ``table_name`` and ``SQLconnection``.

        Returns
        -------
        None
            The result is written to ``vis._vis_data``. Nothing is done when the
            recorded minimum or maximum of the attribute is NaN.
        """
        import numpy as np
        import pandas as pd

        bin_attribute = list(filter(lambda x: x.bin_size != 0, vis._inferred_intent))[0]
        attribute = bin_attribute.attribute

        # Binning needs a usable range: skip when EITHER bound is NaN. The previous
        # check (`not isnan(min) and isnan(max)`) only ran when the max was NaN.
        recorded_min, recorded_max = vis.data.min_max[attribute][0], vis.data.min_max[attribute][1]
        if math.isnan(recorded_min) or math.isnan(recorded_max):
            return

        num_bins = bin_attribute.bin_size
        attr_min = min(ldf.unique_values[attribute])
        attr_max = max(ldf.unique_values[attribute])
        attr_type = type(ldf.unique_values[attribute][0])

        # The bin edges have to be known before querying for the relevant data.
        bin_width = (attr_max - attr_min) / num_bins
        edge_labels = []
        for e in range(1, num_bins):
            curr_edge = attr_min + e * bin_width
            # Integer attributes get integer edges.
            edge_labels.append(str(math.ceil(curr_edge)) if attr_type == int else str(curr_edge))
        upper_edges = ",".join(edge_labels)

        # Apply the Vis's filters so the histogram reflects the filtered rows,
        # consistent with how the aggregate queries use the WHERE clause.
        vis_filter, filter_vars = SQLExecutor.execute_filter(vis)
        bin_count_query = f"SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket({attribute}, '{{{upper_edges}}}') FROM {ldf.table_name} {vis_filter}) as Buckets GROUP BY width_bucket ORDER BY width_bucket"
        bin_count_data = pd.read_sql(bin_count_query, ldf.SQLconnection)

        # There are N bins but only N-1 interior edges, so the first and last
        # centers are derived from attr_min / attr_max.
        edge_values = [float(label) for label in edge_labels]
        first_center = (attr_min + attr_min + bin_width) / 2
        last_center = (edge_values[len(edge_values) - 1] + attr_max) / 2
        if attr_type == int:
            first_center = math.ceil(first_center)
            last_center = math.ceil(last_center)
        inner_centers = np.mean(np.vstack([edge_values[0:-1], edge_values[1:]]), axis=0)
        bin_centers = np.concatenate(([first_center], inner_centers, [last_center]))

        if len(bin_centers) > len(bin_count_data):
            # Buckets without rows are absent from the query result: add them with
            # a zero count. DataFrame.append was removed in pandas 2.0, so use concat.
            present_buckets = set(bin_count_data["width_bucket"].unique())
            missing_rows = [[i, 0] for i in range(len(bin_centers)) if i not in present_buckets]
            if missing_rows:
                bin_count_data = pd.concat(
                    [bin_count_data, pd.DataFrame(missing_rows, columns=bin_count_data.columns)],
                    ignore_index=True,
                )
            # Restore bucket order so each count lines up with its bin center.
            bin_count_data = bin_count_data.sort_values("width_bucket")

        vis._vis_data = pd.DataFrame(
            np.array([bin_centers, list(bin_count_data["count"])]).T,
            columns=[attribute, "Number of Records"],
        )
        vis._vis_data = utils.pandas_to_lux(vis.data)
Esempio n. 4
0
    def execute_aggregate(vis: Vis, ldf: LuxDataFrame):
        """
        Run a GROUP BY query for a bar/line Vis and store the result on the Vis.

        The axis that carries an aggregation is the measure and the other axis is
        the group-by attribute. Categories missing from the query result are
        appended with zeros.

        Parameters
        ----------
        vis : lux.Vis
            Vis with attributes on the "x" and "y" channels.
        ldf : LuxDataFrame
            Frame providing ``table_name``, ``SQLconnection`` and ``unique_values``.
        """
        import pandas as pd

        x_attr = vis.get_attr_by_channel("x")[0]
        y_attr = vis.get_attr_by_channel("y")[0]
        groupby_attr = ""
        measure_attr = ""
        # y is examined first and x second, so an aggregation on x wins if both have one.
        for measure_candidate, group_candidate in ((y_attr, x_attr), (x_attr, y_attr)):
            if measure_candidate.aggregation != "":
                groupby_attr = group_candidate
                measure_attr = measure_candidate
                agg_func = measure_candidate.aggregation

        if measure_attr != "":
            where_clause, filterVars = SQLExecutor.execute_filter(vis)
            group_col = groupby_attr.attribute
            measure_col = measure_attr.attribute

            if measure_col == "Record":
                # Bar chart case: count the rows of each group.
                count_query = (
                    f"SELECT {group_col}, COUNT({group_col}) "
                    f"FROM {ldf.table_name} {where_clause} GROUP BY {group_col}"
                )
                vis._vis_data = pd.read_sql(count_query, ldf.SQLconnection)
                vis._vis_data = utils.pandas_to_lux(vis.data.rename(columns={"count": "Record"}))
            else:
                # Only these aggregations are translated; anything else runs no query.
                for agg_name, sql_function in (("mean", "AVG"), ("sum", "SUM"), ("max", "MAX")):
                    if agg_func == agg_name:
                        agg_query = (
                            f"SELECT {group_col}, {sql_function}({measure_col}) as {measure_col} "
                            f"FROM {ldf.table_name} {where_clause} GROUP BY {group_col}"
                        )
                        vis._vis_data = pd.read_sql(agg_query, ldf.SQLconnection)
                        vis._vis_data = utils.pandas_to_lux(vis.data)

            # Pad categories removed by the filter: they have no datapoints, so
            # every aggregated column is set to 0.
            all_attr_vals = ldf.unique_values[groupby_attr.attribute]
            result_vals = list(vis.data[groupby_attr.attribute])
            if len(result_vals) != len(all_attr_vals):
                absent_vals = [val for val in all_attr_vals if val not in result_vals]
                for val in absent_vals:
                    zero_padding = [0] * (len(vis.data.columns) - 1)
                    vis.data.loc[len(vis.data)] = [val] + zero_padding
Esempio n. 5
0
    def execute_filter(vis: Vis) -> bool:
        """
        Narrow ``vis.data`` down to the rows matching every filter of the Vis.

        Parameters
        ----------
        vis : Vis
            Vis whose data is already populated.

        Returns
        -------
        bool
            True when at least one filter was applied, False otherwise.
        """
        assert (
            vis.data is not None
        ), "execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)"
        filter_specs = utils.get_filter_specs(vis._inferred_intent)
        if not filter_specs:
            return False

        # TODO: Need to handle OR logic
        # Filters are applied one after another, so they combine as a conjunction.
        for spec in filter_specs:
            vis._vis_data = PandasExecutor.apply_filter(
                vis.data, spec.attribute, spec.filter_op, spec.value
            )
        return True
Esempio n. 6
0
    def execute_binning(ldf: LuxDataFrame, vis: Vis):
        """
        Bin the values of the binned attribute to produce histogram data

        Parameters
        ----------
        ldf : lux.core.frame
            LuxDataFrame with specified intent; receives a message when missing
            values are dropped.
        vis: lux.Vis
            lux.Vis object that represents a visualization

        Returns
        -------
        None
        """
        import numpy as np

        binned_clauses = [clause for clause in vis._inferred_intent if clause.bin_size != 0]
        target = binned_clauses[0]
        column = target.attribute
        values = vis.data[column]

        if values.hasnans:
            # Tell the user that missing values are excluded before dropping them.
            ldf._message.add_unique(
                f"The column <code>{column}</code> contains missing values, not shown in the displayed histogram.",
                priority=100,
            )
            values = values.dropna()
        if pd.api.types.is_object_dtype(values):
            values = values.astype("float", errors="ignore")

        counts, edges = np.histogram(values, bins=target.bin_size)
        # np.histogram returns N+1 edges; the left edge of each bin is its location.
        left_edges = edges[0:-1]
        table = np.array([left_edges, counts]).T
        vis._vis_data = pd.DataFrame(table, columns=[column, "Number of Records"])
Esempio n. 7
0
    def execute_binning(vis: Vis):
        '''
        Bin the values of the binned attribute to produce histogram data

        Nothing is written when every value of the attribute is NaN.

        Parameters
        ----------
        vis: lux.Vis
            lux.Vis object that represents a visualization

        Returns
        -------
        None
        '''
        import numpy as np

        binned_clauses = [clause for clause in vis._inferred_intent if clause.bin_size != 0]
        target = binned_clauses[0]
        column = target.attribute

        if not np.isnan(vis.data[column]).all():
            # np.histogram breaks if the array contains NaN, so drop them first.
            values = vis.data[column].dropna()
            # TODO: binning runs for the name attribute. The name attribute has datatype quantitative, which is wrong.
            counts, edges = np.histogram(values, bins=target.bin_size)
            # np.histogram returns N+1 edges; each bin is located at the midpoint of its two edges.
            lower_and_upper = np.vstack([edges[0:-1], edges[1:]])
            centers = np.mean(lower_and_upper, axis=0)
            # TODO: Should vis.data be a LuxDataFrame or a Pandas DataFrame?
            table = np.array([centers, counts]).T
            vis._vis_data = pd.DataFrame(table, columns=[column, "Number of Records"])
Esempio n. 8
0
    def execute_2D_binning(vis: Vis):
        """
        Replace vis._vis_data with 2D (heatmap) bin counts over the x and y attributes.

        Both attributes are cut into 30 bins; each non-empty cell becomes one row
        holding its count ("z") and the start/end of its x and y intervals.

        Parameters
        ----------
        vis : Vis
        """
        pd.reset_option('mode.chained_assignment')
        with pd.option_context('mode.chained_assignment', None):
            x_name = vis.get_attr_by_channel("x")[0].attribute
            y_name = vis.get_attr_by_channel("y")[0].attribute

            for bin_col, source_col in (("xBin", x_name), ("yBin", y_name)):
                vis._vis_data.loc[:, bin_col] = pd.cut(vis._vis_data[source_col], bins=30)

            cell_groups = vis._vis_data.groupby(['xBin', 'yBin'])[x_name]
            # .agg in this line throws SettingWithCopyWarning
            result = cell_groups.agg("count").reset_index()
            result = result.rename(columns={x_name: "z"})
            result = result[result["z"] != 0]

            # convert type to facilitate weighted correlation interestingness calculation
            for axis in ("x", "y"):
                bin_col = axis + "Bin"
                result.loc[:, axis + "BinStart"] = result[bin_col].apply(
                    lambda interval: interval.left).astype('float')
                result.loc[:, axis + "BinEnd"] = result[bin_col].apply(
                    lambda interval: interval.right)

            vis._vis_data = result.drop(columns=["xBin", "yBin"])
Esempio n. 9
0
    def execute_scatter(view: Vis, tbl: LuxSQLTable):
        """
        Given a scatterplot vis and a LuxSQLTable, fetch the data required to render the vis.
        1) Generate WHERE clause for the SQL query
        2) Count the datapoints matched by the query
        3) If the count exceeds lux.config.sampling_cap, take a random sample of 10000 rows
        4) Query datapoints needed for the scatterplot visualization
        5) Store the result on the vis as a Lux DataFrame

        Parameters
        ----------
        view: lux.Vis
            lux.Vis object that represents the scatterplot.
        tbl : lux.core.frame
            LuxSQLTable with specified intent.

        Returns
        -------
        None
        """
        # Every attribute of the intent is fetched, except the "Record" placeholder.
        attributes = {
            clause.attribute
            for clause in view._inferred_intent
            if clause.attribute and clause.attribute != "Record"
        }
        where_clause, filterVars = SQLExecutor.execute_filter(view)

        # Count the matching rows once. The explicit alias avoids depending on the
        # column name a backend assigns to an unnamed COUNT. The original sent this
        # count to the database twice and discarded the first result.
        length_query = pandas.read_sql(
            "SELECT COUNT(1) as length FROM {} {}".format(tbl.table_name, where_clause),
            lux.config.SQLconnection,
        )
        row_count = list(length_query["length"])[0]

        # Filter variables are selected as well; column names are double-quoted.
        required_variables = attributes | set(filterVars)
        required_variables = ",".join('"' + var_name + '"' for var_name in required_variables)

        if row_count > lux.config.sampling_cap:
            # The sample size is fixed at 10000 rows regardless of sampling_cap.
            query = f"SELECT {required_variables} FROM {tbl.table_name} {where_clause} ORDER BY random() LIMIT 10000"
        else:
            query = "SELECT {} FROM {} {}".format(required_variables, tbl.table_name, where_clause)
        data = pandas.read_sql(query, lux.config.SQLconnection)
        view._vis_data = utils.pandas_to_lux(data)
        # view._vis_data.length = list(length_query["length"])[0]

        # NOTE(review): this message is added for every scatterplot, not only for
        # large ones — confirm whether it should depend on row_count.
        tbl._message.add_unique(
            "Large scatterplots detected: Lux is automatically binning scatterplots to heatmaps.",
            priority=98,
        )
Esempio n. 10
0
 def execute_filter(vis: Vis):
     """
     Apply every filter of the Vis to vis.data, one after another.

     Returns True when at least one filter was applied, False otherwise.
     """
     assert vis.data is not None, "execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)"
     filter_specs = utils.get_filter_specs(vis._inferred_intent)
     if not filter_specs:
         return False

     # TODO: Need to handle OR logic
     for spec in filter_specs:
         vis._vis_data = PandasExecutor.apply_filter(
             vis.data, spec.attribute, spec.filter_op, spec.value)
     return True
Esempio n. 11
0
    def execute_2D_binning(vis: Vis) -> None:
        """
        Replace vis._vis_data with 2D (heatmap) bins over the x and y attributes.

        Without a color attribute each non-empty cell holds its row count. With one,
        each cell holds the count plus the mode (nominal) or mean (quantitative or
        temporal) of the color attribute.

        Parameters
        ----------
        vis : Vis
        """
        pd.reset_option("mode.chained_assignment")
        with pd.option_context("mode.chained_assignment", None):
            x_name = vis.get_attr_by_channel("x")[0].attribute
            y_name = vis.get_attr_by_channel("y")[0].attribute

            for bin_col, source_col in (("xBin", x_name), ("yBin", y_name)):
                vis._vis_data[bin_col] = pd.cut(
                    vis._vis_data[source_col], bins=lux.config.heatmap_bin_size)

            color_clauses = vis.get_attr_by_channel("color")
            if len(color_clauses) > 0:
                color_clause = color_clauses[0]
                color_name = color_clause.attribute
                cell_groups = vis._vis_data.groupby(["xBin", "yBin"], history=False)[color_name]
                if color_clause.data_type == "nominal":
                    # Compute mode and count. The mode is the majority vote of the category
                    # within a cell; ties are resolved by taking the first item (.iat[0]).
                    aggregations = [
                        ("count", "count"),
                        (color_name, lambda x: pd.Series.mode(x).iat[0]),
                    ]
                    result = cell_groups.agg(aggregations).reset_index()
                elif color_clause.data_type in ("quantitative", "temporal"):
                    # Compute the average of all values in the bin
                    aggregations = [("count", "count"), (color_name, "mean")]
                    result = cell_groups.agg(aggregations).reset_index()
                result = result.dropna()
            else:
                cell_groups = vis._vis_data.groupby(["xBin", "yBin"], history=False)[x_name]
                result = cell_groups.count().reset_index(name=x_name)
                result = result.rename(columns={x_name: "count"})
                result = result[result["count"] != 0]

            # convert type to facilitate weighted correlation interestingness calculation
            for axis in ("x", "y"):
                bin_col = axis + "Bin"
                result[axis + "BinStart"] = result[bin_col].apply(
                    lambda interval: interval.left).astype("float")
                result[axis + "BinEnd"] = result[bin_col].apply(
                    lambda interval: interval.right)

            vis._vis_data = result.drop(columns=["xBin", "yBin"])
Esempio n. 12
0
def test_filter():
    """An equality filter on Origin must keep exactly the matching rows."""
    cars = pd.read_csv("lux/data/car.csv")
    # change pandas dtype for the column "Year" to datetype
    cars["Year"] = pd.to_datetime(cars["Year"], format="%Y")

    usa_only = lux.Clause(attribute="Origin", filter_op="=", value="USA")
    intent = [lux.Clause(attribute="Horsepower"), lux.Clause(attribute="Year"), usa_only]
    vis = Vis(intent, cars)
    vis._vis_data = cars
    PandasExecutor.execute_filter(vis)

    expected_rows = len(cars[cars["Origin"] == "USA"])
    assert len(vis.data) == expected_rows
Esempio n. 13
0
def test_inequalityfilter():
    """SQLExecutor must turn inequality clauses into the expected WHERE clause."""
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")

    # Strict lower bound; this Vis is created without a source table.
    lower_bound_spec = [
        lux.Clause(attribute="horsepower", filter_op=">", value=50),
        lux.Clause(attribute="milespergal"),
    ]
    vis = Vis(lower_bound_spec)
    vis._vis_data = tbl
    where_clause, filter_vars = SQLExecutor.execute_filter(vis)
    assert where_clause == 'WHERE "horsepower" > \'50\' AND "milespergal" IS NOT NULL'
    assert filter_vars == ["horsepower"]

    # Inclusive upper bound, this time with the table passed to the Vis.
    upper_bound_spec = [
        lux.Clause(attribute="horsepower", filter_op="<=", value=100),
        lux.Clause(attribute="milespergal"),
    ]
    vis = Vis(upper_bound_spec, tbl)
    vis._vis_data = tbl
    where_clause, filter_vars = SQLExecutor.execute_filter(vis)
    assert where_clause == 'WHERE "horsepower" <= \'100\' AND "milespergal" IS NOT NULL'
    assert filter_vars == ["horsepower"]
Esempio n. 14
0
    def execute_2D_binning(vis: Vis):
        """
        Replace vis._vis_data with 2D (heatmap) bins over the x and y attributes.

        Both attributes are cut into 40 bins. Without a color attribute each
        non-empty cell holds its row count. With one, each cell holds the count
        plus the mode (nominal) or mean (quantitative) of the color attribute.

        Parameters
        ----------
        vis : Vis
        """
        pd.reset_option('mode.chained_assignment')
        with pd.option_context('mode.chained_assignment', None):
            x_name = vis.get_attr_by_channel("x")[0].attribute
            y_name = vis.get_attr_by_channel("y")[0].attribute

            for bin_col, source_col in (("xBin", x_name), ("yBin", y_name)):
                vis._vis_data.loc[:, bin_col] = pd.cut(vis._vis_data[source_col], bins=40)

            color_clauses = vis.get_attr_by_channel("color")
            if len(color_clauses) > 0:
                color_clause = color_clauses[0]
                color_name = color_clause.attribute
                cell_groups = vis._vis_data.groupby(['xBin', 'yBin'])[color_name]
                if color_clause.data_type == "nominal":
                    # Compute mode and count. The mode is the majority vote of the category
                    # within a cell; ties are resolved by taking the first item (.iat[0]).
                    aggregations = [
                        ("count", "count"),
                        (color_name, lambda x: pd.Series.mode(x).iat[0]),
                    ]
                    result = cell_groups.agg(aggregations).reset_index()
                elif color_clause.data_type == "quantitative":
                    # Compute the average of all values in the bin
                    aggregations = [("count", "count"), (color_name, "mean")]
                    result = cell_groups.agg(aggregations).reset_index()
                result = result.dropna()
            else:
                cell_groups = vis._vis_data.groupby(['xBin', 'yBin'])[x_name]
                # .agg in this line throws SettingWithCopyWarning
                result = cell_groups.agg("count").reset_index()
                result = result.rename(columns={x_name: "count"})
                result = result[result["count"] != 0]

            # convert type to facilitate weighted correlation interestingness calculation
            for axis in ("x", "y"):
                bin_col = axis + "Bin"
                result.loc[:, axis + "BinStart"] = result[bin_col].apply(
                    lambda interval: interval.left).astype('float')
                result.loc[:, axis + "BinEnd"] = result[bin_col].apply(
                    lambda interval: interval.right)

            vis._vis_data = result.drop(columns=["xBin", "yBin"])
Esempio n. 15
0
def test_filter(global_var):
    """The WHERE clause must hold the equality filter plus a NOT NULL check per attribute."""
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")

    usa_only = lux.Clause(attribute="origin", filter_op="=", value="USA")
    intent = [lux.Clause(attribute="horsepower"), lux.Clause(attribute="year"), usa_only]
    vis = Vis(intent, tbl)
    vis._vis_data = tbl
    filter_output = SQLExecutor.execute_filter(vis)

    # The order of the NOT NULL conditions is not fixed, so compare the pieces.
    conditions = filter_output[0].split(" AND ")
    expected_conditions = (
        "WHERE \"origin\" = 'USA'",
        '"horsepower" IS NOT NULL',
        '"year" IS NOT NULL',
    )
    assert all(expected in conditions for expected in expected_conditions)
    assert filter_output[1] == ["origin"]
Esempio n. 16
0
def test_filter():
    """An equality filter must produce the expected WHERE clause and filter variables."""
    connection = psycopg2.connect(
        "host=localhost dbname=postgres user=postgres password=lux")
    sql_df = lux.LuxSQLTable()
    lux.config.set_SQL_connection(connection)
    sql_df.set_SQL_table("car")

    usa_only = lux.Clause(attribute="Origin", filter_op="=", value="USA")
    intent = [lux.Clause(attribute="Horsepower"), lux.Clause(attribute="Year"), usa_only]
    vis = Vis(intent, sql_df)
    vis._vis_data = sql_df
    where_clause, filter_vars = SQLExecutor.execute_filter(vis)

    expected_clause = 'WHERE "Origin" = \'USA\' AND "Year" IS NOT NULL AND "Horsepower" IS NOT NULL'
    assert where_clause == expected_clause
    assert filter_vars == ["Origin"]
Esempio n. 17
0
    def execute_aggregate(vis: Vis, isFiltered=True):
        '''
        Aggregate data points on an axis for bar or line charts

        The axis carrying an aggregation is the measure, the other axis is the
        group-by attribute. vis.data is grouped (optionally also by the color
        attribute), aggregated, padded with zero rows for group values that have
        no datapoints, and sorted by the group-by attribute. The result replaces
        vis._vis_data.

        Parameters
        ----------
        vis: lux.Vis
            lux.Vis object that represents a visualization
        isFiltered: bool
            Takes part in the decision to pad missing group values with zeros
            (see the padding condition below).

        Returns
        -------
        None
        '''
        import numpy as np

        x_attr = vis.get_attr_by_channel("x")[0]
        y_attr = vis.get_attr_by_channel("y")[0]
        has_color = False
        groupby_attr = ""
        measure_attr = ""
        # Nothing to aggregate when either axis has no aggregation setting at all.
        if (x_attr.aggregation is None or y_attr.aggregation is None):
            return
        # y is checked first and x second, so an aggregation on x wins if both have one.
        if (y_attr.aggregation != ""):
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        if (x_attr.aggregation != ""):
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation
        # NOTE(review): if neither axis has an aggregation, groupby_attr is still ""
        # here and `.attribute` fails. Also, attr_unique_vals is only assigned when
        # the attribute is known to unique_values, yet it is read later on.
        if (groupby_attr.attribute in vis.data.unique_values.keys()):
            attr_unique_vals = vis.data.unique_values[groupby_attr.attribute]
        #checks if color is specified in the Vis
        if len(vis.get_attr_by_channel("color")) == 1:
            color_attr = vis.get_attr_by_channel("color")[0]
            color_attr_vals = vis.data.unique_values[color_attr.attribute]
            color_cardinality = len(color_attr_vals)
            #NOTE: might want to have a check somewhere to not use categorical variables with greater than some number of categories as a Color variable----------------
            has_color = True
        else:
            color_cardinality = 1

        if (measure_attr != ""):
            if (measure_attr.attribute == "Record"):
                # Counting rows: reset_index() adds an "index" column, whose per-group
                # count is then renamed to "Record".
                vis._vis_data = vis.data.reset_index()
                #if color is specified, need to group by groupby_attr and color_attr
                if has_color:
                    vis._vis_data = vis.data.groupby(
                        [groupby_attr.attribute,
                         color_attr.attribute]).count().reset_index()
                    vis._vis_data = vis.data.rename(
                        columns={"index": "Record"})
                    vis._vis_data = vis.data[[
                        groupby_attr.attribute, color_attr.attribute, "Record"
                    ]]
                else:
                    vis._vis_data = vis.data.groupby(
                        groupby_attr.attribute).count().reset_index()
                    vis._vis_data = vis.data.rename(
                        columns={"index": "Record"})
                    vis._vis_data = vis.data[[
                        groupby_attr.attribute, "Record"
                    ]]
            else:
                #if color is specified, need to group by groupby_attr and color_attr
                if has_color:
                    groupby_result = vis.data.groupby(
                        [groupby_attr.attribute, color_attr.attribute])
                else:
                    groupby_result = vis.data.groupby(groupby_attr.attribute)
                groupby_result = groupby_result.agg(agg_func)
                intermediate = groupby_result.reset_index()
                # __finalize__ copies the metadata of vis.data onto the aggregated frame.
                vis._vis_data = intermediate.__finalize__(vis.data)
            result_vals = list(vis.data[groupby_attr.attribute])
            #create existing group by attribute combinations if color is specified
            #this is needed to check what combinations of group_by_attr and color_attr values have a non-zero number of elements in them
            # NOTE(review): res_color_combi_vals is built but not read again in this function.
            if has_color:
                res_color_combi_vals = []
                result_color_vals = list(vis.data[color_attr.attribute])
                for i in range(0, len(result_vals)):
                    res_color_combi_vals.append(
                        [result_vals[i], result_color_vals[i]])
            # For filtered aggregation that have missing groupby-attribute values, set these aggregated value as 0, since no datapoints
            # NOTE(review): `and` binds tighter than `or`, so this reads as
            # `isFiltered or (has_color and attr_unique_vals)` — confirm that is intended.
            if (isFiltered or has_color and attr_unique_vals):
                N_unique_vals = len(attr_unique_vals)
                # A complete result has one row per (group value, color value) combination.
                if (len(result_vals) != N_unique_vals * color_cardinality):
                    columns = vis.data.columns
                    if has_color:
                        # Build every (group value, color value) combination, then right-merge
                        # so that combinations missing from the result appear as NaN rows.
                        df = pd.DataFrame({
                            columns[0]:
                            attr_unique_vals * color_cardinality,
                            columns[1]:
                            pd.Series(color_attr_vals).repeat(N_unique_vals)
                        })
                        vis._vis_data = vis.data.merge(
                            df,
                            on=[columns[0], columns[1]],
                            how='right',
                            suffixes=['', '_right'])
                        for col in columns[2:]:
                            vis.data[col] = vis.data[col].fillna(
                                0)  #Triggers __setitem__
                        assert len(
                            list(vis.data[groupby_attr.attribute])
                        ) == N_unique_vals * len(
                            color_attr_vals
                        ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
                        vis._vis_data = vis.data.iloc[:, :
                                                      3]  # Keep only the three relevant columns not the *_right columns resulting from merge
                    else:
                        # Right-merge against all group values so missing ones appear as NaN rows.
                        df = pd.DataFrame({columns[0]: attr_unique_vals})

                        vis._vis_data = vis.data.merge(df,
                                                       on=columns[0],
                                                       how='right',
                                                       suffixes=['', '_right'])

                        # Groups without datapoints get an aggregated value of 0.
                        for col in columns[1:]:
                            vis.data[col] = vis.data[col].fillna(0)
                        assert len(
                            list(vis.data[groupby_attr.attribute])
                        ) == N_unique_vals, f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
            # Order the rows by the group-by attribute and rebuild a clean 0..N-1 index.
            vis._vis_data = vis.data.sort_values(by=groupby_attr.attribute,
                                                 ascending=True)
            vis._vis_data = vis.data.reset_index()
            vis._vis_data = vis.data.drop(columns="index")
Esempio n. 18
0
    def execute_binning(view: Vis, tbl: LuxSQLTable):
        """
        Binning of data points for generating histograms

        Derives equal-width bin edges from the attribute's cached min/max, runs the
        configured ``histogram_counts`` query template to count the records per
        bucket and stores one row per bin (bin center, count) in ``view._vis_data``.
        ``view._vis_data`` is left untouched when the query returns a NULL bucket label.

        Parameters
        ----------
        view: lux.Vis
            lux.Vis object that represents a visualization
        tbl : lux.core.frame
            LuxSQLTable with specified intent.

        Returns
        -------
        None
        """
        import numpy as np

        bin_attribute = list(filter(lambda x: x.bin_size != 0, view._inferred_intent))[0]

        num_bins = bin_attribute.bin_size
        attr_min = tbl._min_max[bin_attribute.attribute][0]
        attr_max = tbl._min_max[bin_attribute.attribute][1]
        attr_type = type(tbl.unique_values[bin_attribute.attribute][0])

        # get filters if available
        where_clause, _ = SQLExecutor.execute_filter(view)

        # need to calculate the bin edges before querying for the relevant data
        bin_width = (attr_max - attr_min) / num_bins
        inner_edges = [attr_min + e * bin_width for e in range(1, num_bins)]
        if attr_type == int:
            upper_edges = ",".join(str(math.ceil(edge)) for edge in inner_edges)
        else:
            upper_edges = ",".join(str(edge) for edge in inner_edges)

        histogram_template = lux.config.query_templates["histogram_counts"]
        if "cases" in histogram_template:
            # handling for non postgres case: emulate width_bucket with a CASE expression
            bucket_edges = [attr_min] + [str(edge) for edge in inner_edges] + [attr_max]
            when_line = "WHEN {column} BETWEEN {lower_edge} AND {upper_edge} THEN {label}"
            when_lines = "CASE "
            for i in range(1, len(bucket_edges)):
                when_lines += (
                    when_line.format(
                        column=bin_attribute.attribute,
                        lower_edge=bucket_edges[i - 1],
                        upper_edge=bucket_edges[i],
                        label=str(i - 1),
                    )
                    + " "
                )
            when_lines += "end"
            bin_count_query = histogram_template.format(
                bucket_cases=when_lines,
                table_name=tbl.table_name,
                where_clause=where_clause,
            )
        else:
            bin_count_query = histogram_template.format(
                bin_attribute=bin_attribute.attribute,
                upper_edges="{" + upper_edges + "}",
                table_name=tbl.table_name,
                where_clause=where_clause,
            )

        bin_count_data = pandas.read_sql(bin_count_query, lux.config.SQLconnection)
        assert len(bin_count_data.columns) == 2 and {"width_bucket", "count"}.issubset(
            bin_count_data.columns
        ), "histogram_counts query must return exactly the columns `width_bucket` and `count`"
        if not bin_count_data["width_bucket"].isnull().values.any():
            # np.histogram breaks if data contain NaN

            # binEdges of size N+1, so need to compute binCenter as the bin location
            upper_edges = [float(i) for i in upper_edges.split(",")]
            first_center = (attr_min + attr_min + bin_width) / 2
            last_center = (upper_edges[-1] + attr_max) / 2
            if attr_type == int:
                first_center = math.ceil(first_center)
                last_center = math.ceil(last_center)
            inner_centers = np.mean(np.vstack([upper_edges[0:-1], upper_edges[1:]]), axis=0)
            bin_centers = np.concatenate(([first_center], inner_centers, [last_center]))

            if len(bin_centers) > len(bin_count_data):
                # buckets without any record are absent from the query result; add them with count 0
                bucket_labels = set(bin_count_data["width_bucket"].tolist())
                missing = [i for i in range(len(bin_centers)) if i not in bucket_labels]
                if missing:
                    # DataFrame.append no longer exists in pandas >= 2.0, so concatenate instead
                    filler = pandas.DataFrame({"width_bucket": missing, "count": [0] * len(missing)})
                    bin_count_data = pandas.concat([bin_count_data, filler], ignore_index=True)
            # counts must be in bucket order to line up with the ascending bin centers
            bin_count_data = bin_count_data.sort_values(by="width_bucket")

            view._vis_data = pandas.DataFrame(
                np.array([bin_centers, list(bin_count_data["count"])]).T,
                columns=[bin_attribute.attribute, "Number of Records"],
            )
            view._vis_data = utils.pandas_to_lux(view.data)
Esempio n. 19
0
    def execute_2D_binning(view: Vis, tbl: LuxSQLTable):
        """
        Two-dimensional binning of data points for generating heatmaps

        Splits the x and y attribute ranges (taken from ``tbl._min_max``) into
        ``lux.config.heatmap_bin_size`` equal-width bins, runs the configured
        ``heatmap_counts`` query template and stores the bucket counts, extended
        with ``xBinStart``/``xBinEnd``/``yBinStart``/``yBinEnd`` columns, in
        ``view._vis_data``.

        Parameters
        ----------
        view: lux.Vis
            lux.Vis object that represents a visualization
        tbl : lux.core.frame
            LuxSQLTable with specified intent.

        Returns
        -------
        None
        """
        import numpy as np

        x_attribute = list(filter(lambda x: x.channel == "x", view._inferred_intent))[0]

        y_attribute = list(filter(lambda x: x.channel == "y", view._inferred_intent))[0]

        num_bins = lux.config.heatmap_bin_size
        x_attr_min = tbl._min_max[x_attribute.attribute][0]
        x_attr_max = tbl._min_max[x_attribute.attribute][1]
        x_attr_type = type(tbl.unique_values[x_attribute.attribute][0])

        y_attr_min = tbl._min_max[y_attribute.attribute][0]
        y_attr_max = tbl._min_max[y_attribute.attribute][1]
        y_attr_type = type(tbl.unique_values[y_attribute.attribute][0])

        # get filters if available
        where_clause, _ = SQLExecutor.execute_filter(view)

        # need to calculate the bin edges before querying for the relevant data
        x_bin_width = (x_attr_max - x_attr_min) / num_bins
        y_bin_width = (y_attr_max - y_attr_min) / num_bins

        # x edges are kept numeric, y edges are kept as strings; both start at the attribute minimum
        x_upper_edges = []
        y_upper_edges = []
        for e in range(0, num_bins):
            x_curr_edge = x_attr_min + e * x_bin_width
            y_curr_edge = y_attr_min + e * y_bin_width
            x_upper_edges.append(math.ceil(x_curr_edge) if x_attr_type == int else x_curr_edge)
            y_upper_edges.append(str(math.ceil(y_curr_edge)) if y_attr_type == int else str(y_curr_edge))
        x_upper_edges_string = ",".join(str(edge) for edge in x_upper_edges)
        y_upper_edges_string = ",".join(y_upper_edges)

        heatmap_template = lux.config.query_templates["heatmap_counts"]
        # NOTE(review): the SQL dialect is detected from the *histogram* template even though the
        # heatmap template is the one being formatted - confirm both templates always change together.
        if "cases" in lux.config.query_templates["histogram_counts"]:
            x_bucket_edges = [x_attr_min]
            y_bucket_edges = [y_attr_min]
            for e in range(1, num_bins):
                x_bucket_edges.append(str(x_attr_min + e * x_bin_width))
                y_bucket_edges.append(str(y_attr_min + e * y_bin_width))
            x_bucket_edges.append(x_attr_max)
            y_bucket_edges.append(y_attr_max)

            when_line = "WHEN {column} BETWEEN {lower_edge} AND {upper_edge} THEN {label}"
            x_when_lines = "CASE "
            y_when_lines = "CASE "
            for i in range(1, len(x_bucket_edges)):
                x_when_lines += (
                    when_line.format(
                        column=x_attribute.attribute,
                        lower_edge=x_bucket_edges[i - 1],
                        upper_edge=x_bucket_edges[i],
                        label=str(i - 1),
                    )
                    + " "
                )
                y_when_lines += (
                    when_line.format(
                        column=y_attribute.attribute,
                        lower_edge=y_bucket_edges[i - 1],
                        upper_edge=y_bucket_edges[i],
                        label=str(i - 1),
                    )
                    + " "
                )
            x_when_lines += "end"
            y_when_lines += "end"

            bin_count_query = heatmap_template.format(
                bucket_cases1=x_when_lines,
                bucket_cases2=y_when_lines,
                table_name=tbl.table_name,
                where_clause=where_clause,
            )
        else:
            bin_count_query = heatmap_template.format(
                x_attribute=x_attribute.attribute,
                x_upper_edges_string="{" + x_upper_edges_string + "}",
                y_attribute=y_attribute.attribute,
                y_upper_edges_string="{" + y_upper_edges_string + "}",
                table_name=tbl.table_name,
                where_clause=where_clause,
            )

        data = pandas.read_sql(bin_count_query, lux.config.SQLconnection)
        assert len(data.columns) == 3 and {"width_bucket1", "width_bucket2", "count"}.issubset(
            data.columns
        ), "heatmap_counts query must return exactly the columns `width_bucket1`, `width_bucket2` and `count`"
        if len(data) > 0:
            # Look the edges up for all rows at once instead of four row-wise DataFrame.apply passes.
            x_edges = np.array([float(edge) for edge in x_upper_edges])
            y_edges = np.array([float(edge) for edge in y_upper_edges])
            # NOTE(review): bucket label b is mapped to edge[b - 1] as the bin END, so label 0 wraps
            # around to the last edge and the reported bins look shifted by one bin width relative
            # to the thresholds sent to the database - behaviour preserved here, please confirm.
            x_index = data["width_bucket1"].astype(int).to_numpy() - 1
            y_index = data["width_bucket2"].astype(int).to_numpy() - 1
            data["xBinStart"] = x_edges[x_index] - x_bin_width
            data["xBinEnd"] = x_edges[x_index]
            data["yBinStart"] = y_edges[y_index] - y_bin_width
            data["yBinEnd"] = y_edges[y_index]
        view._vis_data = utils.pandas_to_lux(data)
Esempio n. 20
0
    def execute_aggregate(view: Vis, tbl: LuxSQLTable, isFiltered=True):
        """
        Aggregate data points on an axis for bar or line charts

        Runs a GROUP BY query for the non-aggregated axis (and the color attribute,
        if one is specified), stores the result in ``view._vis_data``, fills in
        groups that are missing from the result with 0 and sorts by the group-by
        attribute. Only the aggregations ``mean``, ``sum`` and ``max`` (plus record
        counts) are translated to SQL; for any other aggregation no query is issued.

        Parameters
        ----------
        view: lux.Vis
            lux.Vis object that represents a visualization
        tbl : lux.core.frame
            LuxSQLTable with specified intent.
        isFiltered: boolean
            boolean that represents whether a vis has had a filter applied to its data

        Returns
        -------
        None
        """
        x_attr = view.get_attr_by_channel("x")[0]
        y_attr = view.get_attr_by_channel("y")[0]
        if x_attr.aggregation is None or y_attr.aggregation is None:
            return

        groupby_attr = ""
        measure_attr = ""
        if y_attr.aggregation != "":
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        if x_attr.aggregation != "":
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation
        if measure_attr == "":
            # neither axis is aggregated, so there is nothing to compute
            return

        # stays None when no unique values are cached; the zero-fill step is skipped in that case
        attr_unique_vals = None
        if groupby_attr.attribute in tbl.unique_values.keys():
            attr_unique_vals = tbl.unique_values[groupby_attr.attribute]

        # checks if color is specified in the Vis
        color_attrs = view.get_attr_by_channel("color")
        has_color = len(color_attrs) == 1
        if has_color:
            color_attr = color_attrs[0]
            color_attr_vals = tbl.unique_values[color_attr.attribute]
            color_cardinality = len(color_attr_vals)
            # NOTE: might want to have a check somewhere to not use categorical variables with greater than some number of categories as a Color variable----------------
        else:
            color_cardinality = 1

        where_clause, _ = SQLExecutor.execute_filter(view)
        group_name = groupby_attr.attribute
        measure_name = measure_attr.attribute
        # the colored variants group by (and select) the color attribute as well
        if has_color:
            group_columns = f'"{group_name}", "{color_attr.attribute}"'
        else:
            group_columns = f'"{group_name}"'

        if measure_name == "Record":
            # barchart case, need count data for each group
            count_query = f'SELECT {group_columns}, COUNT("{group_name}") FROM {tbl.table_name} {where_clause} GROUP BY {group_columns}'
            view._vis_data = pandas.read_sql(count_query, lux.config.SQLconnection)
            view._vis_data = view._vis_data.rename(columns={"count": "Record"})
            view._vis_data = utils.pandas_to_lux(view._vis_data)
        else:
            # aggregate barchart case, need aggregate data (mean, sum, max) for each group
            sql_functions = {"mean": "AVG", "sum": "SUM", "max": "MAX"}
            sql_function = sql_functions.get(agg_func) if isinstance(agg_func, str) else None
            # any other aggregation leaves view._vis_data as it was, exactly like before
            if sql_function is not None:
                agg_query = f'SELECT {group_columns}, {sql_function}("{measure_name}") as "{measure_name}" FROM {tbl.table_name} {where_clause} GROUP BY {group_columns}'
                view._vis_data = pandas.read_sql(agg_query, lux.config.SQLconnection)
                view._vis_data = utils.pandas_to_lux(view._vis_data)

        result_vals = list(view._vis_data[groupby_attr.attribute])
        # For filtered aggregation that have missing groupby-attribute values, set these aggregated value as 0, since no datapoints
        if attr_unique_vals is not None and (isFiltered or (has_color and attr_unique_vals)):
            N_unique_vals = len(attr_unique_vals)
            if len(result_vals) != N_unique_vals * color_cardinality:
                columns = view._vis_data.columns
                if has_color:
                    # every (group value, color value) combination that should be present
                    df = pandas.DataFrame(
                        {
                            columns[0]: attr_unique_vals * color_cardinality,
                            columns[1]: pandas.Series(color_attr_vals).repeat(N_unique_vals),
                        }
                    )
                    view._vis_data = view._vis_data.merge(
                        df,
                        on=[columns[0], columns[1]],
                        how="right",
                        suffixes=["", "_right"],
                    )
                    for col in columns[2:]:
                        view._vis_data[col] = view._vis_data[col].fillna(0)  # Triggers __setitem__
                    assert len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals * len(
                        color_attr_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."
                    # Keep only the three relevant columns not the *_right columns resulting from merge
                    view._vis_data = view._vis_data.iloc[:, :3]
                else:
                    df = pandas.DataFrame({columns[0]: attr_unique_vals})

                    view._vis_data = view._vis_data.merge(
                        df, on=columns[0], how="right", suffixes=["", "_right"]
                    )

                    for col in columns[1:]:
                        view._vis_data[col] = view._vis_data[col].fillna(0)
                    assert (
                        len(list(view._vis_data[groupby_attr.attribute])) == N_unique_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
        view._vis_data = view._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)
        view._vis_data = view._vis_data.reset_index(drop=True)
Esempio n. 21
0
    def execute_binning(view: Vis, tbl: LuxSQLTable):
        """
        Binning of data points for generating histograms

        Derives equal-width bin edges from the attribute's cached min/max, counts
        the records per bucket with a ``width_bucket`` query and stores one row per
        bin (bin center, count) in ``view._vis_data``. ``view._vis_data`` is left
        untouched when the query returns a NULL bucket label.

        Parameters
        ----------
        view: lux.Vis
            lux.Vis object that represents a visualization
        tbl : lux.core.frame
            LuxSQLTable with specified intent.

        Returns
        -------
        None
        """
        import numpy as np

        bin_attribute = list(filter(lambda x: x.bin_size != 0, view._inferred_intent))[0]

        num_bins = bin_attribute.bin_size
        attr_min = tbl._min_max[bin_attribute.attribute][0]
        attr_max = tbl._min_max[bin_attribute.attribute][1]
        attr_type = type(tbl.unique_values[bin_attribute.attribute][0])

        # get filters if available
        where_clause, _ = SQLExecutor.execute_filter(view)

        # need to calculate the bin edges before querying for the relevant data
        bin_width = (attr_max - attr_min) / num_bins
        inner_edges = [attr_min + e * bin_width for e in range(1, num_bins)]
        if attr_type == int:
            upper_edges = ",".join(str(math.ceil(edge)) for edge in inner_edges)
        else:
            upper_edges = ",".join(str(edge) for edge in inner_edges)

        bin_count_query = "SELECT width_bucket, COUNT(width_bucket) FROM (SELECT width_bucket(CAST (\"{}\" AS FLOAT), '{}') FROM {} {}) as Buckets GROUP BY width_bucket ORDER BY width_bucket".format(
            bin_attribute.attribute,
            "{" + upper_edges + "}",
            tbl.table_name,
            where_clause,
        )

        bin_count_data = pandas.read_sql(bin_count_query, lux.config.SQLconnection)
        if not bin_count_data["width_bucket"].isnull().values.any():
            # np.histogram breaks if data contain NaN

            # binEdges of size N+1, so need to compute binCenter as the bin location
            upper_edges = [float(i) for i in upper_edges.split(",")]
            first_center = (attr_min + attr_min + bin_width) / 2
            last_center = (upper_edges[-1] + attr_max) / 2
            if attr_type == int:
                first_center = math.ceil(first_center)
                last_center = math.ceil(last_center)
            inner_centers = np.mean(np.vstack([upper_edges[0:-1], upper_edges[1:]]), axis=0)
            bin_centers = np.concatenate(([first_center], inner_centers, [last_center]))

            if len(bin_centers) > len(bin_count_data):
                # buckets without any record are absent from the query result; add them with count 0
                bucket_labels = set(bin_count_data["width_bucket"].tolist())
                missing = [i for i in range(len(bin_centers)) if i not in bucket_labels]
                if missing:
                    # DataFrame.append no longer exists in pandas >= 2.0, so concatenate instead
                    filler = pandas.DataFrame({"width_bucket": missing, "count": [0] * len(missing)})
                    bin_count_data = pandas.concat([bin_count_data, filler], ignore_index=True)
            # counts must be in bucket order to line up with the ascending bin centers
            bin_count_data = bin_count_data.sort_values(by="width_bucket")

            view._vis_data = pandas.DataFrame(
                np.array([bin_centers, list(bin_count_data["count"])]).T,
                columns=[bin_attribute.attribute, "Number of Records"],
            )
            view._vis_data = utils.pandas_to_lux(view.data)
Esempio n. 22
0
    def execute_aggregate(vis: Vis, isFiltered=True):
        """
        Aggregate data points on an axis for bar or line charts

        Groups ``vis.data`` by the non-aggregated axis (and the color attribute, if
        one is specified), applies the aggregation of the other axis, fills in
        groups that are missing from the result with 0 and stores the result,
        sorted by the group-by attribute, in ``vis._vis_data``.

        Parameters
        ----------
        vis: lux.Vis
            lux.Vis object that represents a visualization
        isFiltered: boolean
            boolean that represents whether a vis has had a filter applied to its data

        Returns
        -------
        None
        """
        x_attr = vis.get_attr_by_channel("x")[0]
        y_attr = vis.get_attr_by_channel("y")[0]
        if x_attr.aggregation is None or y_attr.aggregation is None:
            return

        groupby_attr = ""
        measure_attr = ""
        if y_attr.aggregation != "":
            groupby_attr = x_attr
            measure_attr = y_attr
            agg_func = y_attr.aggregation
        if x_attr.aggregation != "":
            groupby_attr = y_attr
            measure_attr = x_attr
            agg_func = x_attr.aggregation
        if measure_attr == "":
            # neither axis is aggregated, so there is nothing to compute
            return

        # stays None when no unique values are cached; the zero-fill step is skipped in that case
        attr_unique_vals = None
        if groupby_attr.attribute in vis.data.unique_values.keys():
            attr_unique_vals = vis.data.unique_values[groupby_attr.attribute]

        # checks if color is specified in the Vis
        color_attrs = vis.get_attr_by_channel("color")
        has_color = len(color_attrs) == 1
        if has_color:
            color_attr = color_attrs[0]
            color_attr_vals = vis.data.unique_values[color_attr.attribute]
            color_cardinality = len(color_attr_vals)
            # NOTE: might want to have a check somewhere to not use categorical variables with greater than some number of categories as a Color variable----------------
        else:
            color_cardinality = 1

        # if color is specified, need to group by groupby_attr and color_attr
        if has_color:
            group_keys = [groupby_attr.attribute, color_attr.attribute]
        else:
            group_keys = groupby_attr.attribute

        if measure_attr.attribute == "Record":
            # need to get the index name so that we can rename the index column to "Record"
            # if there is no index, default to "index"
            index_name = vis.data.index.name
            if index_name is None:
                index_name = "index"

            # turn the index into a regular column so that counting it yields the number of records
            vis._vis_data = vis.data.reset_index()
            vis._vis_data = (
                vis.data.groupby(group_keys, dropna=False, history=False)
                .count()
                .reset_index()
                .rename(columns={index_name: "Record"})
            )
            if has_color:
                vis._vis_data = vis.data[[groupby_attr.attribute, color_attr.attribute, "Record"]]
            else:
                vis._vis_data = vis.data[[groupby_attr.attribute, "Record"]]
        else:
            groupby_result = vis.data.groupby(group_keys, dropna=False, history=False)
            groupby_result = groupby_result.agg(agg_func)
            intermediate = groupby_result.reset_index()
            vis._vis_data = intermediate.__finalize__(vis.data)

        result_vals = list(vis.data[groupby_attr.attribute])
        # For filtered aggregation that have missing groupby-attribute values, set these aggregated value as 0, since no datapoints
        if attr_unique_vals is not None and (isFiltered or (has_color and attr_unique_vals)):
            N_unique_vals = len(attr_unique_vals)
            if len(result_vals) != N_unique_vals * color_cardinality:
                columns = vis.data.columns
                if has_color:
                    # every (group value, color value) combination that should be present
                    df = pd.DataFrame(
                        {
                            columns[0]: attr_unique_vals * color_cardinality,
                            columns[1]: pd.Series(color_attr_vals).repeat(N_unique_vals),
                        }
                    )
                    vis._vis_data = vis.data.merge(
                        df,
                        on=[columns[0], columns[1]],
                        how="right",
                        suffixes=["", "_right"],
                    )
                    for col in columns[2:]:
                        vis.data[col] = vis.data[col].fillna(0)  # Triggers __setitem__
                    assert len(list(vis.data[groupby_attr.attribute])) == N_unique_vals * len(
                        color_attr_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute, color_attr.attribute}`."

                    # Keep only the three relevant columns not the *_right columns resulting from merge
                    vis._vis_data = vis.data.iloc[:, :3]
                else:
                    df = pd.DataFrame({columns[0]: attr_unique_vals})

                    vis._vis_data = vis.data.merge(
                        df, on=columns[0], how="right", suffixes=["", "_right"]
                    )

                    for col in columns[1:]:
                        vis.data[col] = vis.data[col].fillna(0)
                    assert (
                        len(list(vis.data[groupby_attr.attribute])) == N_unique_vals
                    ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."

        vis._vis_data = vis._vis_data.dropna(subset=[measure_attr.attribute])
        try:
            vis._vis_data = vis._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)
        except TypeError:
            # sorting fails on columns that mix e.g. numbers and strings; fall back to sorting as text
            warnings.warn(
                f"\nLux detects that the attribute '{groupby_attr.attribute}' maybe contain mixed type."
                + f"\nTo visualize this attribute, you may want to convert the '{groupby_attr.attribute}' into a uniform type as follows:"
                + f"\n\tdf['{groupby_attr.attribute}'] = df['{groupby_attr.attribute}'].astype(str)"
            )
            vis._vis_data[groupby_attr.attribute] = vis._vis_data[groupby_attr.attribute].astype(str)
            vis._vis_data = vis._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)
        vis._vis_data = vis._vis_data.reset_index(drop=True)
Esempio n. 23
0
    def execute_2D_binning(view: Vis, tbl: LuxSQLTable):
        """
        Bin the x and y attributes of a Vis into a 2D grid and store the per-cell counts.

        Both attributes are split into `lux.config.heatmap_bin_size` equal-width bins
        spanning the min/max recorded in `tbl._min_max`. The counting itself is done in
        the database with `width_bucket`; the resulting bucket numbers are then mapped
        back to numeric bin boundaries on the client.

        Parameters
        ----------
        view : Vis
            Vis whose `_inferred_intent` contains one clause on channel "x" and one on
            channel "y". Its `_vis_data` is overwritten with the binned result.
        tbl : LuxSQLTable
            Table to query; `table_name`, `_min_max` and `unique_values` are read from it.

        Returns
        -------
        None
            `view._vis_data` is set to the query result (columns `width_bucket1`,
            `width_bucket2`, `count`) plus, when at least one row is returned,
            `xBinStart`, `xBinEnd`, `yBinStart` and `yBinEnd`.

        Raises
        ------
        IndexError
            If the intent has no clause on the "x" or the "y" channel.
        """
        x_attribute = [clause for clause in view._inferred_intent if clause.channel == "x"][0]
        y_attribute = [clause for clause in view._inferred_intent if clause.channel == "y"][0]

        num_bins = lux.config.heatmap_bin_size
        x_attr_min = tbl._min_max[x_attribute.attribute][0]
        x_attr_max = tbl._min_max[x_attribute.attribute][1]
        # Integer-valued attributes get their thresholds rounded up to whole numbers.
        x_is_int = type(tbl.unique_values[x_attribute.attribute][0]) is int

        y_attr_min = tbl._min_max[y_attribute.attribute][0]
        y_attr_max = tbl._min_max[y_attribute.attribute][1]
        y_is_int = type(tbl.unique_values[y_attribute.attribute][0]) is int

        # get filters if available (the list of filtered attributes is not needed here)
        where_clause, _ = SQLExecutor.execute_filter(view)

        # The bin boundaries must be known before querying: they are sent to the
        # database as the threshold array of width_bucket.
        x_bin_width = (x_attr_max - x_attr_min) / num_bins
        y_bin_width = (y_attr_max - y_attr_min) / num_bins

        def bucket_lower_bounds(attr_min, bin_width, as_int):
            # One threshold per bin, starting at the attribute minimum.
            bounds = [attr_min + i * bin_width for i in range(num_bins)]
            return [math.ceil(bound) for bound in bounds] if as_int else bounds

        x_lower_edges = bucket_lower_bounds(x_attr_min, x_bin_width, x_is_int)
        y_lower_edges = bucket_lower_bounds(y_attr_min, y_bin_width, y_is_int)
        x_thresholds = ",".join(str(edge) for edge in x_lower_edges)
        y_thresholds = ",".join(str(edge) for edge in y_lower_edges)

        bin_count_query = "SELECT width_bucket1, width_bucket2, count(*) FROM (SELECT width_bucket(CAST (\"{}\" AS FLOAT), '{}') as width_bucket1, width_bucket(CAST (\"{}\" AS FLOAT), '{}') as width_bucket2 FROM {} {}) as foo GROUP BY width_bucket1, width_bucket2".format(
            x_attribute.attribute,
            "{" + x_thresholds + "}",
            y_attribute.attribute,
            "{" + y_thresholds + "}",
            tbl.table_name,
            where_clause,
        )

        data = pandas.read_sql(bin_count_query, lux.config.SQLconnection)
        if len(data) > 0:
            # width_bucket(operand, thresholds[]) treats the thresholds as inclusive
            # LOWER bounds and numbers buckets from 1, so bucket k starts at
            # thresholds[k - 1] and extends one bin width above it.
            x_starts = data["width_bucket1"].map(
                lambda bucket: float(x_lower_edges[int(bucket) - 1])
            )
            y_starts = data["width_bucket2"].map(
                lambda bucket: float(y_lower_edges[int(bucket) - 1])
            )
            data["xBinStart"] = x_starts
            data["xBinEnd"] = x_starts + float(x_bin_width)
            data["yBinStart"] = y_starts
            data["yBinEnd"] = y_starts + float(y_bin_width)
        view._vis_data = utils.pandas_to_lux(data)