Code Example #1
File: Compiler.py Project: WZTT-xh/lux
    def populate_wildcard_options(_inferred_intent: List[Clause], ldf: LuxDataFrame) -> dict:
        """
        Given wildcards and constraints in the LuxDataFrame's intent,
        return the list of available values that satisfy the data_type or data_model constraints.

        Parameters
        ----------
        ldf : LuxDataFrame
                LuxDataFrame whose columns, data types, and unique values are used to populate the available wildcard options.

        Returns
        -------
        intent: Dict[str,list]
                a dictionary that holds the attributes and filters generated from wildcards and constraints.
        """
        import copy
        from lux.utils.utils import convert_to_list
        from lux.vis.Clause import Clause  # needed below when a wildcard filter clause is rewritten

        intent = {"attributes": [], "filters": []}
        for clause in _inferred_intent:
            spec_options = []
            if clause.value == "":  # attribute
                if clause.attribute == "?":
                    options = set(list(ldf.columns))  # all attributes
                    if clause.data_type != "":
                        options = options.intersection(set(ldf.data_type[clause.data_type]))
                    if clause.data_model != "":
                        options = options.intersection(set(ldf.data_model[clause.data_model]))
                    options = list(options)
                else:
                    options = convert_to_list(clause.attribute)
                for optStr in options:
                    if str(optStr) not in clause.exclude:
                        spec_copy = copy.copy(clause)
                        spec_copy.attribute = optStr
                        spec_options.append(spec_copy)
                intent["attributes"].append(spec_options)
            else:  # filters
                attr_lst = convert_to_list(clause.attribute)
                for attr in attr_lst:
                    options = []
                    if clause.value == "?":
                        options = ldf.unique_values[attr]
                        specInd = _inferred_intent.index(clause)
                        _inferred_intent[specInd] = Clause(
                            attribute=clause.attribute,
                            filter_op="=",
                            value=list(options),
                        )
                    else:
                        options.extend(convert_to_list(clause.value))
                    for optStr in options:
                        if str(optStr) not in clause.exclude:
                            spec_copy = copy.copy(clause)
                            spec_copy.attribute = attr
                            spec_copy.value = optStr
                            spec_options.append(spec_copy)
                intent["filters"].extend(spec_options)

        return intent
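
For context, here is a minimal usage sketch of the wildcard mechanism this helper implements: a "?" attribute in an intent is expanded into one Clause per column that satisfies the stated data_type/data_model constraint. The dataset URL and column names below are illustrative assumptions, not part of the example above.

import pandas as pd
import lux  # importing lux attaches the intent/compiler machinery to pandas DataFrames

# Illustrative dataset (URL and column names are assumptions for this sketch)
df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/car.csv")

# "?" is a wildcard attribute constrained to measure columns; when the intent is compiled,
# populate_wildcard_options expands it into one candidate Clause per quantitative column.
df.intent = [
    lux.Clause(attribute="?", data_model="measure"),
    lux.Clause(attribute="Origin"),
]
df  # displaying the frame triggers compilation and renders one vis per expanded option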
Code Example #2
    def determine_encoding(ldf: LuxDataFrame, vis: Vis):
        """
        Populates Vis with the appropriate mark type and channel information based on ShowMe logic
        Currently supports up to 3 dimensions or measures

        Parameters
        ----------
        ldf : lux.luxDataFrame.LuxDataFrame
                LuxDataFrame with underspecified intent
        vis : lux.vis.Vis

        Returns
        -------
        None

        Notes
        -----
        Implementing automatic encoding from Tableau's VizQL
        Mackinlay, J. D., Hanrahan, P., & Stolte, C. (2007).
        Show Me: Automatic presentation for visual analysis.
        IEEE Transactions on Visualization and Computer Graphics, 13(6), 1137–1144.
        https://doi.org/10.1109/TVCG.2007.70594
        """
        # Count number of measures and dimensions
        ndim = 0
        nmsr = 0
        filters = []
        for clause in vis._inferred_intent:
            if (clause.value == ""):
                if (clause.data_model == "dimension"):
                    ndim += 1
                elif (clause.data_model == "measure"
                      and clause.attribute != "Record"):
                    nmsr += 1
            else:  # preserve to add back to _inferred_intent later
                filters.append(clause)
        # Helper function (TODO: Move this into utils)
        def line_or_bar(ldf, dimension: Clause, measure: Clause):
            dim_type = dimension.data_type
            # If no aggregation function is specified, then default as average
            if (measure.aggregation == ""):
                measure.set_aggregation("mean")
            if (dim_type == "temporal" or dim_type == "ordinal"):
                return "line", {"x": dimension, "y": measure}
            else:  # unordered categorical
                # if cardinality is larger than 5, sort the bars
                if ldf.cardinality[dimension.attribute] > 5:
                    dimension.sort = "ascending"
                return "bar", {"x": measure, "y": dimension}

        # ShowMe logic + additional heuristics
        #count_col = Clause( attribute="count()", data_model="measure")
        count_col = Clause(attribute="Record",
                           aggregation="count",
                           data_model="measure",
                           data_type="quantitative")
        auto_channel = {}
        if (ndim == 0 and nmsr == 1):
            # Histogram with Count
            measure = vis.get_attr_by_data_model("measure",
                                                 exclude_record=True)[0]
            if (len(vis.get_attr_by_attr_name("Record")) == 0):  # only add a count column when "Record" is not already present
                vis._inferred_intent.append(count_col)
            # If no bin specified, then default as 10
            if (measure.bin_size == 0):
                measure.bin_size = 10
            auto_channel = {"x": measure, "y": count_col}
            vis.mark = "histogram"
        elif (ndim == 1 and (nmsr == 0 or nmsr == 1)):
            # Line or Bar Chart
            if (nmsr == 0):
                vis._inferred_intent.append(count_col)
            dimension = vis.get_attr_by_data_model("dimension")[0]
            measure = vis.get_attr_by_data_model("measure")[0]
            vis.mark, auto_channel = line_or_bar(ldf, dimension, measure)
        elif (ndim == 2 and (nmsr == 0 or nmsr == 1)):
            # Line or Bar chart broken down by the dimension
            dimensions = vis.get_attr_by_data_model("dimension")
            d1 = dimensions[0]
            d2 = dimensions[1]
            if (ldf.cardinality[d1.attribute] < ldf.cardinality[d2.attribute]):
                # d1.channel = "color"
                vis.remove_column_from_spec(d1.attribute)
                dimension = d2
                color_attr = d1
            else:
                if (d1.attribute == d2.attribute):
                    vis._inferred_intent.pop(
                        0
                    )  # if same attribute then remove_column_from_spec will remove both dims, we only want to remove one
                else:
                    vis.remove_column_from_spec(d2.attribute)
                dimension = d1
                color_attr = d2
            # Colored Bar/Line chart with Count as default measure
            if (nmsr == 0):
                vis._inferred_intent.append(count_col)
            measure = vis.get_attr_by_data_model("measure")[0]
            vis.mark, auto_channel = line_or_bar(ldf, dimension, measure)
            auto_channel["color"] = color_attr
        elif (ndim == 0 and nmsr == 2):
            # Scatterplot
            vis.mark = "scatter"
            vis._inferred_intent[0].set_aggregation(None)
            vis._inferred_intent[1].set_aggregation(None)
            auto_channel = {
                "x": vis._inferred_intent[0],
                "y": vis._inferred_intent[1]
            }
        elif (ndim == 1 and nmsr == 2):
            # Scatterplot broken down by the dimension
            measure = vis.get_attr_by_data_model("measure")
            m1 = measure[0]
            m2 = measure[1]

            vis._inferred_intent[0].set_aggregation(None)
            vis._inferred_intent[1].set_aggregation(None)

            color_attr = vis.get_attr_by_data_model("dimension")[0]
            vis.remove_column_from_spec(color_attr)
            vis.mark = "scatter"
            auto_channel = {"x": m1, "y": m2, "color": color_attr}
        elif (ndim == 0 and nmsr == 3):
            # Scatterplot with color
            vis.mark = "scatter"
            auto_channel = {
                "x": vis._inferred_intent[0],
                "y": vis._inferred_intent[1],
                "color": vis._inferred_intent[2]
            }
        relevant_attributes = [
            auto_channel[channel].attribute for channel in auto_channel
        ]
        relevant_min_max = dict((attr, ldf.min_max[attr])
                                for attr in relevant_attributes
                                if attr != "Record" and attr in ldf.min_max)
        vis.min_max = relevant_min_max
        if (auto_channel != {}):
            vis = Compiler.enforce_specified_channel(vis, auto_channel)
            vis._inferred_intent.extend(
                filters)  # add back the preserved filters
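
To see the ShowMe branches above from the outside, constructing a Vis with one categorical dimension and one measure should land in the ndim == 1 and nmsr == 1 case and come back as a bar chart, while a temporal dimension takes the line path in line_or_bar. A minimal sketch, assuming the car dataset and column names below (they are not part of the source above):

import pandas as pd
import lux
from lux.vis.Vis import Vis

df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/car.csv")

# One unordered categorical dimension + one quantitative measure -> bar chart with the
# measure on x and the dimension on y, per the line_or_bar helper above.
vis = Vis(["Origin", "MilesPerGal"], df)
print(vis.mark)  # expected: "bar"

# A temporal dimension instead is expected to take the "line" branch.
vis_line = Vis(["Year", "MilesPerGal"], df)
print(vis_line.mark)  # expected: "line"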
Code Example #3
    def determine_encoding(ldf: LuxDataFrame, vis: Vis):
        """
        Populates Vis with the appropriate mark type and channel information based on ShowMe logic
        Currently supports up to 3 dimensions or measures

        Parameters
        ----------
        ldf : lux.core.frame
                LuxDataFrame with underspecified intent
        vis : lux.vis.Vis

        Returns
        -------
        None

        Notes
        -----
        Implementing automatic encoding from Tableau's VizQL
        Mackinlay, J. D., Hanrahan, P., & Stolte, C. (2007).
        Show Me: Automatic presentation for visual analysis.
        IEEE Transactions on Visualization and Computer Graphics, 13(6), 1137–1144.
        https://doi.org/10.1109/TVCG.2007.70594
        """
        # Count number of measures and dimensions
        ndim = vis._ndim
        nmsr = vis._nmsr
        # preserve to add back to _inferred_intent later
        filters = utils.get_filter_specs(vis._inferred_intent)

        # Helper function (TODO: Move this into utils)
        def line_or_bar_or_geo(ldf, dimension: Clause, measure: Clause):
            dim_type = dimension.data_type
            # If no aggregation function is specified, then default as average
            if measure.aggregation == "":
                measure.set_aggregation("mean")
            if dim_type == "temporal" or dim_type == "ordinal":
                if isinstance(dimension.attribute, pd.Timestamp):
                    # If the attribute is a Timestamp, use its date repr (e.g., Timestamp('2020-04-05 00:00:00') --> '2020-04-05')
                    attr = str(dimension.attribute._date_repr)
                else:
                    attr = dimension.attribute
                if ldf.cardinality[attr] == 1:
                    return "bar", {"x": measure, "y": dimension}
                else:
                    return "line", {"x": dimension, "y": measure}
            else:  # unordered categorical
                # if cardinality is larger than 5, sort the bars
                if ldf.cardinality[dimension.attribute] > 5:
                    dimension.sort = "ascending"
                if utils.like_geo(dimension.get_attr()):
                    return "geographical", {"x": dimension, "y": measure}
                return "bar", {"x": measure, "y": dimension}

        # ShowMe logic + additional heuristics
        # count_col = Clause( attribute="count()", data_model="measure")
        count_col = Clause(
            attribute="Record",
            aggregation="count",
            data_model="measure",
            data_type="quantitative",
        )
        auto_channel = {}
        if ndim == 0 and nmsr == 1:
            # Histogram with Count
            measure = vis.get_attr_by_data_model("measure",
                                                 exclude_record=True)[0]
            if len(vis.get_attr_by_attr_name("Record")) == 0:  # only add a count column when "Record" is not already present
                vis._inferred_intent.append(count_col)
            # If no bin specified, then default as 10
            if measure.bin_size == 0:
                measure.bin_size = 10
            auto_channel = {"x": measure, "y": count_col}
            vis._mark = "histogram"
        elif ndim == 1 and (nmsr == 0 or nmsr == 1):
            # Line or Bar Chart
            if nmsr == 0:
                vis._inferred_intent.append(count_col)
            dimension = vis.get_attr_by_data_model("dimension")[0]
            measure = vis.get_attr_by_data_model("measure")[0]
            vis._mark, auto_channel = line_or_bar_or_geo(
                ldf, dimension, measure)
        elif ndim == 2 and (nmsr == 0 or nmsr == 1):
            # Line or Bar chart broken down by the dimension
            dimensions = vis.get_attr_by_data_model("dimension")
            d1 = dimensions[0]
            d2 = dimensions[1]
            if ldf.cardinality[d1.attribute] < ldf.cardinality[d2.attribute]:
                # d1.channel = "color"
                vis.remove_column_from_spec(d1.attribute)
                dimension = d2
                color_attr = d1
            else:
                # if same attribute then remove_column_from_spec will remove both dims, we only want to remove one
                if d1.attribute == d2.attribute:
                    vis._inferred_intent.pop(0)
                else:
                    vis.remove_column_from_spec(d2.attribute)
                dimension = d1
                color_attr = d2
            # Colored Bar/Line chart with Count as default measure
            if not ldf.pre_aggregated:
                if nmsr == 0:
                    vis._inferred_intent.append(count_col)
                measure = vis.get_attr_by_data_model("measure")[0]
                vis._mark, auto_channel = line_or_bar_or_geo(
                    ldf, dimension, measure)
                auto_channel["color"] = color_attr
        elif ndim == 0 and nmsr == 2:
            # Scatterplot
            vis._mark = "scatter"
            vis._inferred_intent[0].set_aggregation(None)
            vis._inferred_intent[1].set_aggregation(None)
            auto_channel = {
                "x": vis._inferred_intent[0],
                "y": vis._inferred_intent[1]
            }
        elif ndim == 1 and nmsr == 2:
            # Scatterplot broken down by the dimension
            measure = vis.get_attr_by_data_model("measure")
            m1 = measure[0]
            m2 = measure[1]

            vis._inferred_intent[0].set_aggregation(None)
            vis._inferred_intent[1].set_aggregation(None)

            color_attr = vis.get_attr_by_data_model("dimension")[0]
            vis.remove_column_from_spec(color_attr)
            vis._mark = "scatter"
            auto_channel = {"x": m1, "y": m2, "color": color_attr}
        elif ndim == 0 and nmsr == 3:
            # Scatterplot with color
            vis._mark = "scatter"
            auto_channel = {
                "x": vis._inferred_intent[0],
                "y": vis._inferred_intent[1],
                "color": vis._inferred_intent[2],
            }
        relevant_attributes = [
            auto_channel[channel].attribute for channel in auto_channel
        ]
        relevant_min_max = dict((attr, ldf._min_max[attr])
                                for attr in relevant_attributes
                                if attr != "Record" and attr in ldf._min_max)
        # Replace scatterplot with heatmap
        HBIN_START = 5000
        if vis.mark == "scatter" and lux.config.heatmap and len(
                ldf) > HBIN_START:
            vis._postbin = True
            ldf._message.add_unique(
                f"Large scatterplots detected: Lux is automatically binning scatterplots to heatmaps.",
                priority=98,
            )
            vis._mark = "heatmap"
        vis._min_max = relevant_min_max
        if auto_channel != {}:
            vis = Compiler.enforce_specified_channel(vis, auto_channel)
            vis._inferred_intent.extend(
                filters)  # add back the preserved filters
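
Compared with the previous revision, this one adds a geographical mark for geo-like dimensions, skips the count-column logic for pre-aggregated frames, and converts large scatterplots into heatmaps. The heatmap switch is gated by the global flag the code reads; a minimal sketch of toggling it (HBIN_START = 5000 is the row-count threshold shown above):

import lux

# Disable the automatic scatter-to-heatmap conversion checked above via lux.config.heatmap;
# large scatterplots (len(ldf) > HBIN_START) then keep the raw "scatter" mark.
lux.config.heatmap = False

# Restore the default behavior so oversized scatterplots are binned into heatmaps again.
lux.config.heatmap = True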