Example #1
0
    def execute_filter(view: Vis):
        """
        Helper function to convert a Vis' filter specification to a SQL where clause.
        Takes in a Vis object and returns an appropriate SQL WHERE clause based on the filters specified in the vis' _inferred_intent.

        Every filter clause becomes a ``"attribute" op 'value'`` condition, and every
        attribute clause whose attribute is not "Record" contributes an additional
        ``"attribute" IS NOT NULL`` condition. All conditions are joined with AND.

        Parameters
        ----------
        view: lux.Vis
            lux.Vis object that represents a visualization

        Returns
        -------
        where_clause: string
            String representation of a SQL WHERE clause ("" when there is no condition at all)
        filter_vars: list of strings
            list of variables that have been used as filters (each listed once, in first-seen order)
        """

        def quote_identifier(name):
            # Double embedded double quotes so the name cannot terminate the quoted identifier early.
            return '"' + str(name).replace('"', '""') + '"'

        def quote_literal(value):
            # Double embedded single quotes so the value cannot terminate the string literal early.
            return "'" + str(value).replace("'", "''") + "'"

        conditions = []
        filter_vars = []
        for clause in utils.get_filter_specs(view._inferred_intent) or []:
            conditions.append(
                " ".join(
                    [
                        quote_identifier(clause.attribute),
                        str(clause.filter_op),
                        quote_literal(clause.value),
                    ]
                )
            )
            if clause.attribute not in filter_vars:
                filter_vars.append(clause.attribute)

        # need to ensure that no null values are included in the data
        # null values breaks binning queries
        for clause in utils.get_attrs_specs(view._inferred_intent):
            if clause.attribute != "Record":
                conditions.append(quote_identifier(clause.attribute) + " IS NOT NULL")

        if not conditions:
            return ("", [])
        return ("WHERE " + " AND ".join(conditions), filter_vars)
Example #2
0
def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> float:
    """
    Difference in bar chart/histogram shape from overall chart
    Note: this function assumes that the filtered vis.data is operating on the same range as the unfiltered vis.data.

    The measure column of the filtered vis and of an unfiltered copy of the vis are each
    normalized to sum to 1 and compared by Euclidean distance. The distance is scaled by
    ``get_filtered_size(filter_specs, ldf) / len(vis.data)`` and, for bar charts, by a
    factor derived from how many categories change rank once the filter is applied.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame
    filter_specs : list
            List of filters from the Vis
    msr_attribute : str
            The attribute name of the measure value of the chart

    Returns
    -------
    float
            Score describing how different the vis is from the overall vis
            (0 when the filtered measure column sums to 0)

    Raises
    ------
    AssertionError
            If the filtered and unfiltered data do not have the same number of rows
    """
    v_filter_size = get_filtered_size(filter_specs, ldf)
    v_size = len(vis.data)
    v_filter = vis.data[msr_attribute]
    total = v_filter.sum()
    # Check for an all-zero measure column before dividing by its total.
    if total == 0:
        return 0
    v_filter = v_filter / total  # normalize by total to get ratio

    # Generate an "Overall" Vis (TODO: This is computed multiple times for every vis, alternative is to directly access df.current_vis but we do not have guarantee that will always be unfiltered vis (in the non-Filter action scenario))
    import copy

    unfiltered_vis = copy.copy(vis)
    # Remove filters, keep only attribute intent
    unfiltered_vis._inferred_intent = utils.get_attrs_specs(vis._inferred_intent)
    ldf.executor.execute([unfiltered_vis], ldf)

    v = unfiltered_vis.data[msr_attribute]
    v = v / v.sum()
    assert len(v) == len(v_filter), "Data for filtered and unfiltered vis have unequal length."
    sig = v_filter_size / v_size  # significance factor

    rank_sig = 1  # category measure value ranking significance factor
    # if the vis is a barchart, count how many categories' rank, based on measure value, changes after the filter is applied
    if vis.mark == "bar":
        dim_list = vis.get_attr_by_data_model("dimension")

        # use Pandas rank function to calculate rank positions for each category
        v_rank = unfiltered_vis.data.rank()[msr_attribute]
        v_filter_rank = vis.data.rank()[msr_attribute]
        # go through and count the number of ranking changes between the filtered and unfiltered data
        # The counter starts at 1 and only the first num_categories - 1 entries are compared,
        # so the normalized factor never reaches 0 and never exceeds 1.
        num_categories = ldf.cardinality[dim_list[0].attribute]
        for r in range(num_categories - 1):
            if v_rank[r] != v_filter_rank[r]:
                rank_sig += 1
        # normalize ranking significance factor
        rank_sig = rank_sig / num_categories

    # Euclidean distance as L2 function
    from scipy.spatial.distance import euclidean

    return sig * rank_sig * euclidean(v, v_filter)
Example #3
0
    def intent_to_JSON(intent):
        """
        Summarize an intent as a dictionary of attribute names.

        Parameters
        ----------
        intent : list
            Clauses to summarize; split into filter and attribute clauses by
            utils.get_filter_specs and utils.get_attrs_specs.

        Returns
        -------
        dict
            "attributes" maps to the `attribute` of every attribute clause and
            "filters" maps to the `attribute` of every filter clause.
        """
        from lux.utils import utils

        filter_clauses = utils.get_filter_specs(intent)
        attribute_clauses = utils.get_attrs_specs(intent)

        def names(clauses):
            # Keep only the attribute name of each clause.
            return [clause.attribute for clause in clauses]

        return {"attributes": names(attribute_clauses), "filters": names(filter_clauses)}
Example #4
0
    def context_to_JSON(context):
        """
        Summarize a context as a dictionary of attribute names.

        Parameters
        ----------
        context : list
            Specs to summarize; split into filter and attribute specs by
            utils.get_filter_specs and utils.get_attrs_specs.

        Returns
        -------
        dict
            "attributes" maps to the `attribute` of every attribute spec and
            "filters" maps to the `attribute` of every filter spec.
        """
        from lux.utils import utils

        selected_filters = utils.get_filter_specs(context)
        selected_attrs = utils.get_attrs_specs(context)
        return {
            "attributes": [entry.attribute for entry in selected_attrs],
            "filters": [entry.attribute for entry in selected_filters],
        }
Example #5
0
    def create_where_clause(filter_specs, view=""):
        """
        Build a SQL WHERE clause from a list of filter clauses.

        Every filter becomes a ``"attribute" op 'value'`` condition. When a view is given,
        every attribute clause of its _inferred_intent whose attribute is not "Record"
        contributes an additional ``"attribute" IS NOT NULL`` condition. All conditions
        are joined with AND.

        Parameters
        ----------
        filter_specs : list
            Filter clauses exposing `attribute`, `filter_op` and `value`; may be empty or None
        view : object, optional
            Object exposing `_inferred_intent`; the default "" disables the null checks

        Returns
        -------
        where_clause: string
            String representation of a SQL WHERE clause ("" when there is no condition at all)
        filter_vars: list
            Attributes that have been used as filters (each listed once, in first-seen order)
        """

        def quote_identifier(name):
            # Double embedded double quotes so the name cannot terminate the quoted identifier early.
            return '"' + str(name).replace('"', '""') + '"'

        def quote_literal(value):
            # Double embedded single quotes so the value cannot terminate the string literal early.
            return "'" + str(value).replace("'", "''") + "'"

        conditions = []
        filter_vars = []
        for clause in filter_specs or []:
            conditions.append(
                " ".join(
                    [
                        quote_identifier(clause.attribute),
                        str(clause.filter_op),
                        quote_literal(clause.value),
                    ]
                )
            )
            if clause.attribute not in filter_vars:
                filter_vars.append(clause.attribute)

        if view != "":
            # need to ensure that no null values are included in the data
            # null values breaks binning queries
            for clause in utils.get_attrs_specs(view._inferred_intent):
                if clause.attribute != "Record":
                    conditions.append(quote_identifier(clause.attribute) + " IS NOT NULL")

        if not conditions:
            return ("", [])
        return ("WHERE " + " AND ".join(conditions), filter_vars)
Example #6
0
def deviation_from_overall(view: View, ldf: LuxDataFrame, filter_specs: list,
                           msr_attribute: str) -> float:
    """
    Difference in bar chart/histogram shape from overall chart
    Note: this function assumes that the filtered view.data is operating on the same range as the unfiltered view.data.

    The measure column of the filtered view and of an unfiltered copy of the view are each
    normalized to sum to 1 and compared by Euclidean distance. The distance is scaled by
    ``get_filtered_size(filter_specs, ldf) / len(view.data)``.

    Parameters
    ----------
    view : View
    ldf : LuxDataFrame
    filter_specs : list
        List of filters from the View
    msr_attribute : str
        The attribute name of the measure value of the chart

    Returns
    -------
    float
        Score describing how different the view is from the overall view
        (0 when the filtered measure column sums to 0)

    Raises
    ------
    AssertionError
        If the filtered and unfiltered data do not have the same number of rows
    """
    v_filter_size = get_filtered_size(filter_specs, ldf)
    v_size = len(view.data)
    v_filter = view.data[msr_attribute]
    total = v_filter.sum()
    # An all-zero measure column cannot be normalized; dividing would turn the score into NaN.
    if total == 0:
        return 0
    v_filter = v_filter / total  # normalize by total to get ratio

    # Generate an "Overall" View (TODO: This is computed multiple times for every view, alternative is to directly access df.current_view but we do not have guarantee that will always be unfiltered view (in the non-Filter action scenario))
    import copy
    unfiltered_view = copy.copy(view)
    # Remove filters, keep only attribute specs
    unfiltered_view.spec_lst = utils.get_attrs_specs(view.spec_lst)
    ldf.executor.execute([unfiltered_view], ldf)

    v = unfiltered_view.data[msr_attribute]
    v = v / v.sum()
    assert len(v) == len(
        v_filter), "Data for filtered and unfiltered view have unequal length."
    sig = v_filter_size / v_size  # significance factor
    # Euclidean distance as L2 function
    from scipy.spatial.distance import euclidean
    return sig * euclidean(v, v_filter)
Example #7
0
def filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each categorical value filters the data.

    With exactly one filter in the intent, the filter is swapped for alternatives: every other
    unique value for a nominal attribute, or the complementary inequality for a quantitative
    attribute. Otherwise, one vis is generated per unique value of every column whose
    cardinality is below 30 and which is not already an attribute of the current vis.
    Every generated vis is scored with `interestingness` and the top 15 are kept.

    Parameters
    ----------
    ldf : lux.core.frame
            LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
            object with a collection of visualizations that result from the Filter action.
    """
    filters = utils.get_filter_specs(ldf._intent)
    output = []
    # if fltr is specified, create visualizations where data is filtered by all values of the fltr's categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_vis[0]._inferred_intent)
    # Materialize the attribute names as a list: a map() iterator is consumed by membership
    # tests, so repeated `in` checks would wrongly report already-specified columns as absent.
    column_spec_attr = [clause.attribute for clause in column_spec]
    if len(filters) == 1:
        # get unique values for all categorical values specified and creates corresponding filters
        fltr = filters[0]
        fltr_data_type = ldf.data_type_lookup[fltr.attribute]

        # NOTE(review): when the data type is neither "nominal" nor "quantitative",
        # `recommendation` is never assigned and the final assignment below raises
        # UnboundLocalError. Left as-is; confirm the intended result for other data types.
        if fltr_data_type == "nominal":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value.",
            }
            current_values = [fltr.value]
            # creates vis with new filters
            for val in ldf.unique_values[fltr.attribute]:
                if val not in current_values:
                    new_spec = column_spec.copy()
                    new_spec.append(lux.Clause(attribute=fltr.attribute, value=val))
                    output.append(Vis(new_spec))
        elif fltr_data_type == "quantitative":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative inequality operation.",
            }

            def get_complementary_ops(fltr_op):
                # Map an inequality operator to its logical complement; any other
                # operator falls through and yields None.
                if fltr_op == ">":
                    return "<="
                elif fltr_op == "<":
                    return ">="
                elif fltr_op == ">=":
                    return "<"
                elif fltr_op == "<=":
                    return ">"
                # TODO: need to support case where fltr_op is "=" --> auto-binned ranges

            # Create vis with complementary filter operations
            new_spec = column_spec.copy()
            new_spec.append(
                lux.Clause(
                    attribute=fltr.attribute,
                    filter_op=get_complementary_ops(fltr.filter_op),
                    value=fltr.value,
                )
            )
            output.append(Vis(new_spec, score=1))
    # if no existing filters, create filters using unique values from all categorical variables in the dataset
    else:
        intended_attrs = ", ".join(
            [
                clause.attribute
                for clause in ldf._intent
                if clause.value == "" and clause.attribute != "Record"
            ]
        )
        recommendation = {
            "action": "Filter",
            "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent.",
        }
        # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
        categorical_vars = [
            col
            for col in ldf.columns
            if ldf.cardinality[col] < 30 and col not in column_spec_attr
        ]
        for cat in categorical_vars:
            for val in ldf.unique_values[cat]:
                new_spec = column_spec.copy()
                new_spec.append(lux.Clause(attribute=cat, filter_op="=", value=val))
                output.append(Vis(new_spec))
    vlist = lux.vis.VisList.VisList(output, ldf)
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    vlist = vlist.topK(15)
    recommendation["collection"] = vlist
    return recommendation
Example #8
0
def interestingness(vis: Vis, ldf: LuxDataFrame) -> float:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    The vis type is determined from the number of dimension and measure attributes
    (ignoring "Record"), the number of filters and, for the colored line/bar case, the mark.
    Combinations that have no metric score -1.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    float
        Interestingness Score (-1 when no metric applies or there are too few data points)

    Raises
    ------
    Exception
        If vis.data is None or empty
    """
    if vis.data is None or len(vis.data) == 0:
        raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")

    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

    # "Record" is excluded when counting dimensions and measures
    attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
    n_dim = sum(1 for clause in attr_specs if clause.data_model == "dimension")
    n_msr = sum(1 for clause in attr_specs if clause.data_model == "measure")
    n_filter = len(filter_specs)
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)

    # Line/Bar Chart
    if n_dim == 1 and n_msr in (0, 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        if n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
        # No metric for more than one filter: return the default score, not an implicit None
        return -1
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        if n_filter == 0:
            return skewness(vis.data["Number of Records"])
        if n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
        # No metric for more than one filter: return the default score, not an implicit None
        return -1
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 2:
            return -1
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 5:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute

        # Fewer colors score higher; 40 or more distinct colors are rejected
        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        return -1
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line and barchart cases
    elif (vis.mark == "line" or vis.mark == "bar") and n_dim == 2:
        return 0.2
    # Default
    else:
        return -1
Example #9
0
def interestingness(vis: Vis, ldf: LuxDataFrame) -> float:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    The vis type is determined from the number of dimension and measure attributes
    (ignoring "Record"), the number of filters and the mark. Combinations that have no
    metric, and a vis without data, score -1.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    float
            Interestingness Score (-1 when no metric applies or there are too few data points)
    """

    if vis.data is None or len(vis.data) == 0:
        return -1

    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

    # "Record" is excluded when counting dimensions and measures
    attr_specs = [
        clause for clause in vis_attrs_specs if clause.attribute != "Record"
    ]
    n_dim = sum(1 for clause in attr_specs if clause.data_model == "dimension")
    n_msr = sum(1 for clause in attr_specs if clause.data_model == "measure")
    n_filter = len(filter_specs)
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)

    # Line/Bar Chart
    if n_dim == 1 and n_msr in (0, 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        if n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs,
                                          measure_lst[0].attribute)
        # No metric for more than one filter: return the default score, not an implicit None
        return -1
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        # Both histogram metrics need the "Number of Records" column
        if "Number of Records" in vis.data:
            if n_filter == 0:
                return skewness(vis.data["Number of Records"])
            if n_filter == 1:
                return deviation_from_overall(vis, ldf, filter_specs,
                                              "Number of Records")
        return -1
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 10:
            return -1
        if vis.mark == "heatmap":
            return weighted_correlation(vis.data["xBinStart"],
                                        vis.data["yBinStart"],
                                        vis.data["count"])
        if n_filter == 1:
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 10:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute

        # Fewer colors score higher; 40 or more distinct colors are rejected
        C = ldf.cardinality[color_attr]
        if C < 40:
            return 1 / C
        return -1
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line and barchart cases
    elif vis.mark == "line" and n_dim == 2:
        return 0.15
    # for colored bar chart, scoring based on Chi-square test for independence score.
    # gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users
    elif vis.mark == "bar" and n_dim == 2:
        from scipy.stats import chi2_contingency

        measure_column = measure_lst[0].attribute
        groupby_column = dimension_lst[0].attribute
        color_column = dimension_lst[1].attribute

        groupby_cardinality = ldf.cardinality[groupby_column]
        groupby_unique_vals = ldf.unique_values[groupby_column]
        # One row of measure values per value of the group-by column
        contingency_table = [
            vis.data[vis.data[groupby_column] ==
                     groupby_unique_vals[c]][measure_column]
            for c in range(groupby_cardinality)
        ]
        # NOTE(review): this fallback (0.12) is higher than the 0.10 cap applied to a
        # successful chi-square score below; confirm this is intended.
        score = 0.12
        # ValueError results if an entire column of the contingency table is 0, can happen if an applied filter results in
        # a category having no counts
        try:
            color_cardinality = ldf.cardinality[color_column]
            # scale down score based on number of categories
            chi2_score = chi2_contingency(contingency_table)[0] * 0.9**(
                color_cardinality + groupby_cardinality)
            score = min(0.10, chi2_score)
        except ValueError:
            pass
        return score
    # Default
    else:
        return -1
Example #10
0
def interestingness(vis: Vis, ldf: LuxDataFrame) -> float:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    The vis type is determined from vis._ndim, vis._nmsr, the number of filters and the
    mark. Combinations that have no metric, and a vis without data, score -1.

    When lux.config.interestingness_fallback is set, any Exception raised while scoring is
    reported through warnings.warn and the score is -1; otherwise the exception propagates.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    float
            Interestingness Score (-1 when no metric applies, there are too few data points,
            or scoring failed in fallback mode)
    """

    if vis.data is None or len(vis.data) == 0:
        return -1
    try:
        filter_specs = utils.get_filter_specs(vis._inferred_intent)
        vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
        n_dim = vis._ndim
        n_msr = vis._nmsr
        n_filter = len(filter_specs)
        attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
        dimension_lst = vis.get_attr_by_data_model("dimension")
        measure_lst = vis.get_attr_by_data_model("measure")
        v_size = len(vis.data)

        # Similarity case: the current vis is a single filtered line chart, so score this
        # vis by how close it is to the current vis.
        if (
            n_dim == 1
            and (n_msr == 0 or n_msr == 1)
            and ldf.current_vis is not None
            and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
            and len(ldf.current_vis) == 1
            and ldf.current_vis[0].mark == "line"
            and len(get_filter_specs(ldf.intent)) > 0
        ):
            query_vc = VisList(ldf.current_vis, ldf)
            query_vis = query_vc[0]
            preprocess(query_vis)
            preprocess(vis)
            return 1 - euclidean_dist(query_vis, vis)

        # Line/Bar Chart
        if n_dim == 1 and (n_msr == 0 or n_msr == 1):
            if v_size < 2:
                return -1

            if n_filter == 0:
                return unevenness(vis, ldf, measure_lst, dimension_lst)
            if n_filter == 1:
                return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
            # No metric for more than one filter: return the default score, not an implicit None
            return -1
        # Histogram
        elif n_dim == 0 and n_msr == 1:
            if v_size < 2:
                return -1
            # Both histogram metrics need the "Number of Records" column
            if "Number of Records" in vis.data:
                if n_filter == 0:
                    return skewness(vis.data["Number of Records"])
                if n_filter == 1:
                    return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
            return -1
        # Scatter Plot
        elif n_dim == 0 and n_msr == 2:
            if v_size < 10:
                return -1
            if vis.mark == "heatmap":
                return weighted_correlation(
                    vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"]
                )
            if n_filter == 1:
                v_filter_size = get_filtered_size(filter_specs, vis.data)
                sig = v_filter_size / v_size
            else:
                sig = 1
            return sig * monotonicity(vis, attr_specs)
        # Scatterplot colored by Dimension
        elif n_dim == 1 and n_msr == 2:
            if v_size < 10:
                return -1
            color_attr = vis.get_attr_by_channel("color")[0].attribute

            # Fewer colors score higher; 40 or more distinct colors are rejected
            C = ldf.cardinality[color_attr]
            if C < 40:
                return 1 / C
            return -1
        # Scatterplot colored by measure
        elif n_msr == 3:
            return 0.1
        # colored line and barchart cases
        elif vis.mark == "line" and n_dim == 2:
            return 0.15
        # for colored bar chart, scoring based on Chi-square test for independence score.
        # gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users
        elif vis.mark == "bar" and n_dim == 2:
            from scipy.stats import chi2_contingency

            measure_column = vis.get_attr_by_data_model("measure")[0].attribute
            dimension_columns = vis.get_attr_by_data_model("dimension")

            groupby_column = dimension_columns[0].attribute
            color_column = dimension_columns[1].attribute

            contingency_tbl = pd.crosstab(
                vis.data[groupby_column],
                vis.data[color_column],
                values=vis.data[measure_column],
                aggfunc=sum,
            )

            try:
                color_cardinality = ldf.cardinality[color_column]
                groupby_cardinality = ldf.cardinality[groupby_column]
                # scale down score based on number of categories
                chi2_score = chi2_contingency(contingency_tbl)[0] * 0.9 ** (
                    color_cardinality + groupby_cardinality
                )
                score = min(0.10, chi2_score)
            except (ValueError, KeyError):
                # ValueError results if an entire column of the contingency table is 0, can happen if an applied filter results in a category having no counts
                score = -1
            return score
        # Default
        else:
            return -1
    except Exception:
        # Catch Exception, not everything: KeyboardInterrupt and SystemExit must never
        # be turned into a -1 score by the fallback.
        if lux.config.interestingness_fallback:
            # Suppress interestingness related issues
            warnings.warn(f"An error occurred when computing interestingness for: {vis}")
            return -1
        else:
            raise
Example #11
0
def add_filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each categorical
    value filters the data.

    Three cases are handled:

    * Exactly one filter in the intent: the filter is swapped for alternatives (every other unique value for a
      nominal attribute, or each operator returned by get_complementary_ops for a quantitative attribute).
    * Any other number of filters: one vis is generated per unique value of every column whose cardinality is
      between 2 and 29 and which is not already an attribute of the current vis.
    * The current vis is a single line chart and the intent has at least one filter: the result above is
      discarded and replaced by a "Similarity" recommendation built from the intent with its last clause
      replaced by a clause over all unique values of the last filter's attribute.

    Parameters
    ----------
    ldf : lux.core.frame
            LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
            object with a collection of visualizations that result from the Filter action.
            Keys are "action", "description", "long_description" and "collection".
    """
    filters = utils.get_filter_specs(ldf._intent)
    filter_values = []
    output = []
    # if fltr is specified, create visualizations where data is filtered by all values of the fltr's categorical
    # variable
    # NOTE(review): ldf.current_vis[0] is read here without a None check, although the Similarity condition
    # further down does test `ldf.current_vis is not None` -- confirm current_vis is always populated here.
    column_spec = utils.get_attrs_specs(ldf.current_vis[0].intent)
    column_spec_attr = list(map(lambda x: x.attribute, column_spec))
    if len(filters) == 1:
        # get unique values for all categorical values specified and creates corresponding filters
        fltr = filters[0]

        # NOTE(review): if the data type is neither "nominal" nor "quantitative", `recommendation` stays
        # unassigned unless the Similarity branch below assigns it; otherwise reading
        # recommendation["action"] raises UnboundLocalError.
        if ldf.data_type[fltr.attribute] == "nominal":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                               f"alternative value.",
                "long_description": f"Swap out the filter value for {fltr.attribute} to other possible values, while "
                                    f"keeping all else the same. Visualizations are ranked based on interestingness",
            }
            unique_values = ldf.unique_values[fltr.attribute]
            # the value already used by the current filter is excluded from the alternatives
            filter_values.append(fltr.value)
            # creates vis with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_filter = lux.Clause(attribute=fltr.attribute, value=val)
                    new_spec.append(new_filter)
                    temp_vis = Vis(new_spec)
                    output.append(temp_vis)
        elif ldf.data_type[fltr.attribute] == "quantitative":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                               f"alternative inequality operation.",
                "long_description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                                    f"alternative inequality operation.",
            }

            # Create vis with complementary filter operations
            # NOTE: This section of code has been modified to allow for the rendering of multiple vis
            # get_complementary_ops is defined outside this function; it is iterated, so it is expected to
            # return a collection of operators.
            for op in get_complementary_ops(fltr.filter_op):
                new_spec = column_spec.copy()
                new_filter = lux.Clause(
                    attribute=fltr.attribute,
                    filter_op=op,
                    value=fltr.value,
                )
                new_spec.append(new_filter)
                temp_vis = Vis(new_spec, score=1)
                output.append(temp_vis)

    # if no existing filters, create filters using unique values from all categorical variables in the dataset
    # (this branch also runs when the intent holds more than one filter)
    else:
        intended_attrs = ", ".join(
            [
                str(clause.attribute)
                for clause in ldf._intent
                if clause.value == "" and clause.attribute != "Record"
            ]
        )
        recommendation = {
            "action": "Filter",
            "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent.",
            "long_description": f"Adding any filter while keeping the attributes on the x and y axes fixed. "
                                f"Visualizations are ranked based on interestingness",
        }
        categorical_vars = []
        for col in list(ldf.columns):
            # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
            # columns with a single unique value are skipped as well (lower bound 1 is exclusive)
            if 1 < ldf.cardinality[col] < 30 and col not in column_spec_attr:
                categorical_vars.append(col)
        for cat in categorical_vars:
            unique_values = ldf.unique_values[cat]
            for val in unique_values:
                new_spec = column_spec.copy()
                new_filter = lux.Clause(attribute=cat, filter_op="=", value=val)
                new_spec.append(new_filter)
                temp_vis = Vis(new_spec)
                output.append(temp_vis)
    # Similarity case: overrides both the recommendation and the output built above
    if (
        ldf.current_vis is not None
        and len(ldf.current_vis) == 1
        and ldf.current_vis[0].mark == "line"
        and len(get_filter_specs(ldf.intent)) > 0
    ):
        recommendation = {
            "action": "Similarity",
            "description": "Show other charts that are visually similar to the Current vis.",
            "long_description": "Show other charts that are visually similar to the Current vis.",
        }
        last = get_filter_specs(ldf.intent)[-1]
        # output now holds intent clauses (everything but the last one), no longer Vis objects
        output = ldf.intent.copy()[0:-1]
        # array of possible values for attribute
        arr = ldf[last.attribute].unique().tolist()
        # NOTE(review): the attribute name is passed twice positionally -- confirm against lux.Clause's signature
        output.append(lux.Clause(last.attribute, last.attribute, arr))
    # Two VisLists are built from the same output: scores are computed on the copy's vis objects and stored
    # on the matching vis in vlist.
    # NOTE(review): presumably because interestingness() may modify the vis it scores -- confirm.
    vlist = lux.vis.VisList.VisList(output, ldf)
    vlist_copy = lux.vis.VisList.VisList(output, ldf)
    for i in range(len(vlist_copy)):
        vlist[i].score = interestingness(vlist_copy[i], ldf)
    vlist.sort()
    vlist = vlist.showK()
    if recommendation["action"] == "Similarity":
        # the first entry is dropped -- presumably the current vis itself; TODO confirm
        recommendation["collection"] = vlist[1:]
    else:
        recommendation["collection"] = vlist
    return recommendation
Example #12
0
def interestingness(view: View, ldf: LuxDataFrame) -> float:
    """
    Compute the interestingness score of the view.

    The metric that is applied depends on the chart type, which is derived
    from the number of dimensions, measures and filters found in
    ``view.spec_lst`` (the "Record" attribute is not counted).

    Parameters
    ----------
    view : View
        View whose ``data`` has already been populated.
    ldf : LuxDataFrame
        Dataframe passed through to the individual metrics; its
        ``cardinality`` is read for colored scatterplots.

    Returns
    -------
    int or float
        Interestingness score. -1 is returned whenever no metric applies to
        the view (including bar charts / histograms with more than one filter).

    Raises
    ------
    Exception
        If ``view.data`` is None.
    """
    if view.data is None:
        raise Exception(
            "View.data needs to be populated before interestingness can be computed. Run Executor.execute(view,ldf)."
        )

    filter_specs = utils.get_filter_specs(view.spec_lst)
    n_filter = len(filter_specs)

    # "Record" does not take part in the chart-type decision.
    attr_specs = [
        spec for spec in utils.get_attrs_specs(view.spec_lst)
        if spec.attribute != "Record"
    ]
    n_dim = sum(1 for spec in attr_specs if spec.data_model == 'dimension')
    n_msr = sum(1 for spec in attr_specs if spec.data_model == 'measure')

    dimension_lst = view.get_attr_by_data_model("dimension")
    measure_lst = view.get_attr_by_data_model("measure")

    # Bar Chart
    if n_dim == 1 and n_msr in (0, 1):
        if n_filter == 0:
            return unevenness(view, ldf, measure_lst, dimension_lst)
        if n_filter == 1:
            # NOTE(review): assumes measure_lst is non-empty even when
            # n_msr == 0 (e.g. it still holds the "Record" spec) -- confirm.
            return deviation_from_overall(view, ldf, filter_specs,
                                          measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if n_filter == 0:
            return skewness(view.data["Count of Records"])
        if n_filter == 1:
            return deviation_from_overall(view, ldf, filter_specs,
                                          "Count of Records")
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if n_filter == 1:
            # Weight the score by the ratio of the filtered size to len(view.data).
            v_filter_size = get_filtered_size(filter_specs, view.data)
            sig = v_filter_size / len(view.data)
        else:
            sig = 1
        return sig * monotonicity(view, attr_specs)
    # Scatterplot colored by dimension
    elif n_dim == 1 and n_msr == 2:
        color_attr = view.get_attr_by_channel("color")[0].attribute
        cardinality = ldf.cardinality[color_attr]
        # Fewer distinct colors score higher; 40 or more falls back to -1.
        return 1 / cardinality if cardinality < 40 else -1
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1

    # Default: also reached by bar charts / histograms with more than one
    # filter, which previously fell off the end and returned None.
    return -1
Example #13
0
def filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each categorical value filters the data.

    If the intent holds exactly one filter, one visualization is generated for
    every other unique value of that filter's attribute. Otherwise, one
    visualization is generated for every unique value of every column whose
    cardinality is below 30 and which is not already an attribute of the
    current visualization.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with the keys "action", "description" and "collection"; the
        collection is the result of ``topK(15)`` on the scored visualizations.
    """
    # for benchmarking: read the flag once so tic is always set when toc is used
    benchmarking = bool(ldf.toggle_benchmarking)
    if benchmarking:
        tic = time.perf_counter()

    filters = utils.get_filter_specs(ldf.intent)
    output = []
    column_spec = utils.get_attrs_specs(ldf.current_vis[0]._inferred_intent)
    # Must be a real sequence: a bare map() iterator is consumed by the first
    # `in` test, so later columns would never be recognised as already used.
    column_spec_attr = [spec.attribute for spec in column_spec]

    if len(filters) == 1:
        # a filter is specified: swap its value for every other unique value
        # of the same attribute
        fltr = filters[0]
        filter_values = [fltr.value]
        for val in ldf.unique_values[fltr.attribute]:
            if val not in filter_values:
                new_spec = column_spec.copy()
                new_spec.append(lux.Clause(attribute=fltr.attribute, value=val))
                output.append(Vis(new_spec))
        recommendation = {"action": "Filter",
                          "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value."}
    else:
        # no single existing filter: create filters using unique values from
        # all categorical variables in the dataset
        intended_attrs = '<b>' + ', '.join([clause.attribute for clause in ldf.intent if clause.value == '' and clause.attribute != "Record"]) + '</b>'
        recommendation = {"action": "Filter",
                          "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent."}
        # keep columns whose cardinality is not too high and which are not one
        # of the X,Y (specified) columns
        categorical_vars = [
            col for col in list(ldf.columns)
            if ldf.cardinality[col] < 30 and col not in column_spec_attr
        ]
        for cat in categorical_vars:
            for val in ldf.unique_values[cat]:
                new_spec = column_spec.copy()
                new_spec.append(lux.Clause(attribute=cat, filter_op="=", value=val))
                output.append(Vis(new_spec))

    vc = lux.vis.VisList.VisList(output, ldf)
    for view in vc:
        view.score = interestingness(view, ldf)
    vc = vc.topK(15)
    recommendation["collection"] = vc

    # for benchmarking
    if benchmarking:
        toc = time.perf_counter()
        print(f"Performed filter action in {toc - tic:0.4f} seconds")
    return recommendation
Example #14
0
def filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each categorical value filters the data.

    If the context holds filters, one view is generated for every unique value
    of each filter's attribute that has not been seen as a filter value so far.
    Otherwise, one view is generated for every unique value of every column
    whose cardinality is below 40 and which is not already an attribute of the
    current view.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with the keys "action", "description" and "collection"; the
        collection is the result of ``topK(15)`` on the scored views.
    """
    # for benchmarking: read the flag once so tic is always set when toc is used
    benchmarking = bool(ldf.toggle_benchmarking)
    if benchmarking:
        tic = time.perf_counter()

    recommendation = {
        "action":
        "Filter",
        "description":
        "Shows possible visualizations when filtered by categorical variables in the dataset."
    }
    filters = utils.get_filter_specs(ldf.context)
    filter_values = []
    output = []
    column_spec = utils.get_attrs_specs(ldf.current_view[0].spec_lst)
    # Must be a real sequence: a bare map() iterator is consumed by the first
    # `in` test, so later columns would never be recognised as already used.
    column_spec_attr = [spec.attribute for spec in column_spec]

    if len(filters) > 0:
        # filters are specified: for each one, create views filtered by the
        # other unique values of its attribute
        for row in filters:
            # filter_values accumulates across filters, so values used by an
            # earlier filter are skipped for later ones as well
            filter_values.append(row.value)
            for val in ldf.unique_values[row.attribute]:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_spec.append(lux.Spec(attribute=row.attribute, value=val))
                    output.append(View(new_spec))
    else:
        # no existing filters: create filters using unique values from all
        # categorical variables in the dataset; keep columns whose cardinality
        # is not too high and which are not one of the X,Y (specified) columns
        categorical_vars = [
            col for col in list(ldf.columns)
            if ldf.cardinality[col] < 40 and col not in column_spec_attr
        ]
        for cat in categorical_vars:
            for val in ldf.unique_values[cat]:
                new_spec = column_spec.copy()
                new_spec.append(lux.Spec(attribute=cat,
                                         filter_op="=",
                                         value=val))
                output.append(View(new_spec))

    vc = lux.view.ViewCollection.ViewCollection(output)
    vc = vc.load(ldf)
    for view in vc:
        view.score = interestingness(view, ldf)
    vc = vc.topK(15)
    recommendation["collection"] = vc

    # for benchmarking
    if benchmarking:
        toc = time.perf_counter()
        print(f"Performed filter action in {toc - tic:0.4f} seconds")
    return recommendation