Exemple #1
0
    def execute_filter(vis: Vis) -> bool:
        """
        Apply a Vis's filter to vis.data

        Parameters
        ----------
        vis : Vis

        Returns
        -------
        bool
            Boolean flag indicating if any filter was applied
        """
        assert (
            vis.data is not None
        ), "execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)"
        filters = utils.get_filter_specs(vis._inferred_intent)

        if filters:
            # TODO: Need to handle OR logic
            # `fltr` instead of `filter` so the builtin is not shadowed
            for fltr in filters:
                vis._vis_data = PandasExecutor.apply_filter(
                    vis.data, fltr.attribute, fltr.filter_op, fltr.value
                )
            return True
        else:
            return False
Exemple #2
0
def enhance(ldf):
    """
    Given a set of vis, generates possible visualizations when an additional attribute is added to the current vis.

    Parameters
    ----------
    ldf : lux.core.frame
            LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
            object with a collection of visualizations that result from the Enhance action.
    """

    filters = utils.get_filter_specs(ldf._intent)
    # Collect attribute clauses that already exist in the intent
    # (clauses with a value are filters; "Record" is the implicit count attribute)
    attr_specs = list(
        filter(lambda x: x.value == "" and x.attribute != "Record",
               ldf._intent))
    fltr_str = [
        fltr.attribute + fltr.filter_op + str(fltr.value) for fltr in filters
    ]
    attr_str = [str(clause.attribute) for clause in attr_specs]
    intended_attrs = f'<p class="highlight-intent">{", ".join(attr_str + fltr_str)}</p>'
    if len(attr_specs) == 1:
        recommendation = {
            "action": "Enhance",
            "description": f"Augmenting current {intended_attrs} intent with additional attribute.",
        }
    elif len(attr_specs) == 2:
        recommendation = {
            "action": "Enhance",
            "description": f"Further breaking down current {intended_attrs} intent by additional attribute.",
        }
    else:
        # Zero attributes (nothing to enhance) or more than two (chart would be
        # too crowded): return an empty recommendation.  Previously the
        # zero-attribute case left `recommendation` unbound and raised a
        # NameError at the end of the function.
        recommendation = {"action": "Enhance"}
        recommendation["collection"] = []
        return recommendation
    # Clear channel so that channel not enforced based on input vis intent.
    # NOTE: ldf._intent.copy() is a *shallow* copy, so clearing `channel` here
    # mutates the same Clause objects referenced by `filters`/`attr_specs`
    # below — the reassignment of `intent` does not undo the clearing.
    intent = ldf._intent.copy()
    for clause in intent:
        clause.channel = ""
    intent = filters + attr_specs
    # Wildcard "?" asks the VisList to fill in one additional attribute
    intent.append("?")
    vlist = lux.vis.VisList.VisList(intent, ldf)

    # Then use the data populated in the vis list to compute score
    for vis in vlist:
        vis.score = interestingness(vis, ldf)

    vlist.sort()
    vlist = vlist.showK()
    recommendation["collection"] = vlist
    return recommendation
Exemple #3
0
    def execute_filter(view: Vis):
        """
        Helper function to convert a Vis' filter specification to a SQL where clause.
        Takes in a Vis object and returns an appropriate SQL WHERE clause based on the filters specified in the vis' _inferred_intent.

        Parameters
        ----------
        vis: lux.Vis
            lux.Vis object that represents a visualization

        Returns
        -------
        where_clause: string
            String representation of a SQL WHERE clause
        filter_vars: list of strings
            list of variables that have been used as filters
        """
        where_clause = []
        filter_vars = []
        filters = utils.get_filter_specs(view._inferred_intent)
        # enumerate instead of index arithmetic: the first condition opens the
        # clause with WHERE, every subsequent one is ANDed on
        for i, fltr in enumerate(filters):
            where_clause.append("WHERE" if i == 0 else "AND")
            # escape single quotes so the value is safe inside a SQL string literal
            curr_value = str(fltr.value).replace("'", "''")
            where_clause.extend(
                [
                    '"' + str(fltr.attribute) + '"',
                    str(fltr.filter_op),
                    "'" + curr_value + "'",
                ]
            )
            if fltr.attribute not in filter_vars:
                filter_vars.append(fltr.attribute)

        attributes = utils.get_attrs_specs(view._inferred_intent)

        # need to ensure that no null values are included in the data
        # null values breaks binning queries
        for a in attributes:
            if a.attribute != "Record":
                where_clause.append("WHERE" if where_clause == [] else "AND")
                where_clause.extend(
                    [
                        '"' + str(a.attribute) + '"',
                        "IS NOT NULL",
                    ]
                )

        if where_clause == []:
            return ("", [])
        return (" ".join(where_clause), filter_vars)
Exemple #4
0
    def execute_filter(view: View):
        """
        Apply the view's filter specifications to view.data in place.

        Parameters
        ----------
        view : View
            View whose `data` has already been populated.
        """
        assert view.data is not None, "execute_filter assumes input view.data is populated (if not, populate with LuxDataFrame values)"
        filters = utils.get_filter_specs(view.spec_lst)

        if filters:
            # TODO: Need to handle OR logic
            # `fltr` instead of `filter` so the builtin is not shadowed
            for fltr in filters:
                view.data = PandasExecutor.apply_filter(
                    view.data, fltr.attribute, fltr.filter_op,
                    fltr.value)
Exemple #5
0
    def context_to_JSON(context):
        from lux.utils import utils

        # Summarize a context (list of spec clauses) as a plain dict of
        # attribute names, split into plotted attributes vs. filters.
        attribute_specs = utils.get_attrs_specs(context)
        filter_specs = utils.get_filter_specs(context)

        return {
            'attributes': [spec.attribute for spec in attribute_specs],
            'filters': [spec.attribute for spec in filter_specs],
        }
Exemple #6
0
    def intent_to_JSON(intent):
        from lux.utils import utils

        # Summarize an intent (list of clauses) as a plain dict of attribute
        # names, split into plotted attributes vs. filters.  The parameter is
        # rebound to the result dict, matching the original implementation.
        attrs_specs = utils.get_attrs_specs(intent)
        filter_specs = utils.get_filter_specs(intent)

        intent = {
            "attributes": [clause.attribute for clause in attrs_specs],
            "filters": [clause.attribute for clause in filter_specs],
        }
        return intent
 def execute_filter(vis: Vis):
     """
     Apply the vis's filter specifications to vis.data.

     Returns
     -------
     bool
         True if at least one filter was applied, False otherwise.
     """
     assert vis.data is not None, "execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)"
     filters = utils.get_filter_specs(vis._inferred_intent)

     if filters:
         # TODO: Need to handle OR logic
         # `fltr` instead of `filter` so the builtin is not shadowed
         for fltr in filters:
             vis._vis_data = PandasExecutor.apply_filter(vis.data, fltr.attribute, fltr.filter_op, fltr.value)
         return True
     else:
         return False
Exemple #8
0
def test_filter_inequality():
    """The complementary Filter recommendation flips '>' to '<='."""
    df = pd.read_csv("lux/data/car.csv")
    df["Year"] = pd.to_datetime(df["Year"], format='%Y')

    # Intent: Horsepower vs MilesPerGal, filtered to Acceleration > 10
    df.set_intent([
        lux.Clause(attribute="Horsepower"),
        lux.Clause(attribute="MilesPerGal"),
        lux.Clause(attribute="Acceleration", filter_op=">", value=10),
    ])
    df._repr_html_()

    from lux.utils.utils import get_filter_specs

    complement_vis = df.recommendation["Filter"][0]
    complement_filter = get_filter_specs(complement_vis._intent)[0]
    assert complement_filter.filter_op == "<="
    assert complement_filter.value == 10
Exemple #9
0
def enhance(ldf):
    '''
    Given a set of views, generates possible visualizations when an additional attribute is added to the current view.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Enhance action.
    '''
    #for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    recommendation = {
        "action": "Enhance",
        "description": "Shows possible visualizations when an additional attribute is added to the current view."
    }
    filters = utils.get_filter_specs(ldf.context)
    # Collect variables that already exist in the context
    attr_specs = list(
        filter(lambda x: x.value == "" and x.attribute != "Record",
               ldf.context))
    # if there are too many column attributes, don't generate Enhance recommendations
    if len(attr_specs) > 2:
        recommendation["collection"] = []
        return recommendation
    # (A dead `query = ldf.context.copy()` that was immediately overwritten
    # has been removed.)  Wildcard "?" asks the collection to fill in one
    # additional attribute.
    query = filters + attr_specs
    query.append("?")
    vc = lux.view.ViewCollection.ViewCollection(query)
    vc = vc.load(ldf)

    # Then use the data populated in the view collection to compute score
    for view in vc:
        view.score = interestingness(view, ldf)

    vc = vc.topK(15)
    recommendation["collection"] = vc
    #for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed enhance action in {toc - tic:0.4f} seconds")
    return recommendation
Exemple #10
0
def test_filter_inequality(global_var):
    """The complementary Filter recommendation flips '>' to '<='."""
    df = pytest.car_df
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")

    intent = [
        lux.Clause(attribute="Horsepower"),
        lux.Clause(attribute="MilesPerGal"),
        lux.Clause(attribute="Acceleration", filter_op=">", value=10),
    ]
    df.set_intent(intent)
    df._ipython_display_()

    from lux.utils.utils import get_filter_specs

    complement_vis = df.recommendation["Filter"][0]
    complement_filter = get_filter_specs(complement_vis._intent)[0]
    assert complement_filter.filter_op == "<="
    assert complement_filter.value == 10
Exemple #11
0
    def execute_filter(view: Vis):
        """
        Helper function to convert a Vis' filter specification to a SQL where clause.
        Takes in a Vis object and returns an appropriate SQL WHERE clause based on the filters specified in the vis' _inferred_intent.

        Parameters
        ----------
        vis: lux.Vis
            lux.Vis object that represents a visualization

        Returns
        -------
        where_clause: string
            String representation of a SQL WHERE clause
        filter_vars: list of strings
            list of variables that have been used as filters
        """
        # Extract the filter clauses, then delegate the actual SQL
        # construction to the shared helper.
        filter_specs = utils.get_filter_specs(view._inferred_intent)
        return SQLExecutor.create_where_clause(filter_specs, view=view)
Exemple #12
0
 def execute(view_collection: ViewCollection, ldf: LuxDataFrame):
     import pandas as pd
     '''
     Given a ViewCollection, fetch the data required to render the view
     1) Apply filters
     2) Retrieve relevant attributes
     3) return a DataFrame with relevant results
     '''
     for view in view_collection:
         # Select relevant data based on attribute information.
         # (A leftover debug print and a redundant "Record" special case
         # were removed: every non-empty attribute was added regardless.)
         attributes = set()
         for spec in view.spec_lst:
             if spec.attribute:
                 attributes.add(spec.attribute)
         if view.mark not in ["bar", "line", "histogram"]:
             where_clause, filter_vars = SQLExecutor.execute_filter(view)
             required_variables = ",".join(attributes | set(filter_vars))
             # Sample down to 10k rows so very large tables stay responsive
             row_count = list(
                 pd.read_sql(
                     "SELECT COUNT(*) FROM {} {}".format(
                         ldf.table_name, where_clause),
                     ldf.SQLconnection)['count'])[0]
             if row_count > 10000:
                 query = "SELECT {} FROM {} {} ORDER BY random() LIMIT 10000".format(
                     required_variables, ldf.table_name, where_clause)
             else:
                 query = "SELECT {} FROM {} {}".format(
                     required_variables, ldf.table_name, where_clause)
             data = pd.read_sql(query, ldf.SQLconnection)
             view.data = utils.pandas_to_lux(data)
         # Aggregated marks are computed directly in SQL instead
         if view.mark == "bar" or view.mark == "line":
             SQLExecutor.execute_aggregate(view, ldf)
         elif view.mark == "histogram":
             SQLExecutor.execute_binning(view, ldf)
Exemple #13
0
 def execute_filter(view: Vis):
     """
     Build a SQL WHERE clause from the view's filter specifications.

     Returns a tuple (where_clause, filter_vars): the clause string (empty
     string when the view has no filters) and the list of attributes used
     as filters.
     """
     filter_specs = utils.get_filter_specs(view._inferred_intent)
     clause_tokens = []
     filter_vars = []
     for idx, spec in enumerate(filter_specs):
         # First condition opens the clause; the rest are ANDed on
         clause_tokens.append("WHERE" if idx == 0 else "AND")
         clause_tokens.append(str(spec.attribute))
         clause_tokens.append(str(spec.filter_op))
         clause_tokens.append("'" + str(spec.value) + "'")
         if spec.attribute not in filter_vars:
             filter_vars.append(spec.attribute)
     if not clause_tokens:
         return ("", [])
     return (" ".join(clause_tokens), filter_vars)
Exemple #14
0
def generalize(ldf):
    '''
    Generates all possible visualizations when one attribute or filter from the current view is removed.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Generalize action.
    '''
    #for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    # takes in a dataObject and generates a list of new dataObjects, each with a single measure from the original object removed
    # -->  return list of dataObjects with corresponding interestingness scores

    recommendation = {
        "action": "Generalize",
        "description": "Remove one attribute or filter to observe a more general trend."
    }
    output = []
    excluded_columns = []
    column_spec = list(
        filter(lambda x: x.value == "" and x.attribute != "Record",
               ldf.context))
    row_specs = utils.get_filter_specs(ldf.context)
    # if we do not have enough column attributes or too many, return no views.
    if len(column_spec) < 2 or len(column_spec) > 4:
        recommendation["collection"] = []
        return recommendation
    # Drop one attribute at a time; a multi-column spec contributes one
    # candidate view per column
    for spec in column_spec:
        columns = spec.attribute
        if type(columns) == list:
            for column in columns:
                if column not in excluded_columns:
                    temp_view = View(ldf.context)
                    temp_view.remove_column_from_spec_new(column,
                                                          remove_first=True)
                    excluded_columns.append(column)
                    output.append(temp_view)
        elif type(columns) == str:
            if columns not in excluded_columns:
                temp_view = View(ldf.context)
                temp_view.remove_column_from_spec_new(columns,
                                                      remove_first=True)
                excluded_columns.append(columns)
                # BUGFIX: the append belongs inside the guard; previously an
                # already-excluded column re-appended the stale temp_view
                # from an earlier iteration.
                output.append(temp_view)
    # Drop one filter at a time.
    # NOTE(review): pop(i) assumes the i-th filter is at index i of the full
    # context, which holds only if filters precede attributes -- verify.
    for i, spec in enumerate(row_specs):
        new_spec = ldf.context.copy()
        new_spec.pop(i)
        temp_view = View(new_spec)
        output.append(temp_view)

    vc = lux.view.ViewCollection.ViewCollection(output)
    vc = vc.load(ldf)
    recommendation["collection"] = vc
    for view in vc:
        view.score = interestingness(view, ldf)
    vc.sort(remove_invalid=True)
    vc.remove_duplicates()
    #for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed generalize action in {toc - tic:0.4f} seconds")
    return recommendation
Exemple #15
0
def enhance(ldf):
    '''
    Given a set of views, generates possible visualizations when an additional attribute is added to the current vis.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Enhance action.
    '''
    #for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    filters = utils.get_filter_specs(ldf.intent)
    # Collect variables that already exist in the intent
    attr_specs = list(
        filter(lambda x: x.value == "" and x.attribute != "Record",
               ldf.intent))
    fltr_str = [
        fltr.attribute + fltr.filter_op + str(fltr.value) for fltr in filters
    ]
    attr_str = [clause.attribute for clause in attr_specs]
    intended_attrs = '<p class="highlight-intent">' + ', '.join(
        attr_str + fltr_str) + '</p>'
    if len(attr_specs) == 1:
        recommendation = {
            "action": "Enhance",
            "description": f"Augmenting current {intended_attrs} intent with additional attribute."
        }
    elif len(attr_specs) == 2:
        recommendation = {
            "action": "Enhance",
            "description": f"Further breaking down current {intended_attrs} intent by additional attribute."
        }
    else:
        # Zero attributes (nothing to enhance) or more than two (too crowded):
        # return an empty collection.  Previously the zero-attribute case left
        # `recommendation` unbound and raised a NameError below.
        recommendation = {"action": "Enhance"}
        recommendation["collection"] = []
        return recommendation
    # (A dead `intent = ldf.intent.copy()` that was immediately overwritten
    # has been removed.)  Wildcard "?" asks the VisList to fill in one
    # additional attribute.
    intent = filters + attr_specs
    intent.append("?")
    vc = lux.vis.VisList.VisList(intent, ldf)

    # Then use the data populated in the vis list to compute score
    for view in vc:
        view.score = interestingness(view, ldf)

    vc = vc.topK(15)
    recommendation["collection"] = vc
    #for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed enhance action in {toc - tic:0.4f} seconds")
    return recommendation
Exemple #16
0
def filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each categorical value filters the data.

    Parameters
    ----------
    ldf : lux.core.frame
            LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
            object with a collection of visualizations that result from the Filter action.
    """
    filters = utils.get_filter_specs(ldf._intent)
    filter_values = []
    output = []
    # if fltr is specified, create visualizations where data is filtered by all values of the fltr's categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_vis[0]._inferred_intent)
    # BUGFIX: materialize the attribute names into a list.  A bare `map`
    # iterator is exhausted by the first `in` membership test, so every later
    # `col not in column_spec_attr` check silently passed.
    column_spec_attr = [clause.attribute for clause in column_spec]
    if len(filters) == 1:
        # get unique values for all categorical values specified and creates corresponding filters
        fltr = filters[0]

        if ldf.data_type_lookup[fltr.attribute] == "nominal":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value.",
            }
            unique_values = ldf.unique_values[fltr.attribute]
            filter_values.append(fltr.value)
            # creates vis with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_filter = lux.Clause(attribute=fltr.attribute,
                                            value=val)
                    new_spec.append(new_filter)
                    temp_vis = Vis(new_spec)
                    output.append(temp_vis)
        elif ldf.data_type_lookup[fltr.attribute] == "quantitative":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative inequality operation.",
            }

            def get_complementary_ops(fltr_op):
                # Map an inequality to its logical complement
                if fltr_op == ">":
                    return "<="
                elif fltr_op == "<":
                    return ">="
                elif fltr_op == ">=":
                    return "<"
                elif fltr_op == "<=":
                    return ">"
                # TODO: need to support case where fltr_op is "=" --> auto-binned ranges

            # Create vis with complementary filter operations
            new_spec = column_spec.copy()
            new_filter = lux.Clause(
                attribute=fltr.attribute,
                filter_op=get_complementary_ops(fltr.filter_op),
                value=fltr.value,
            )
            new_spec.append(new_filter)
            temp_vis = Vis(new_spec, score=1)
            output.append(temp_vis)
    # if no existing filters, create filters using unique values from all categorical variables in the dataset
    else:
        intended_attrs = ", ".join([
            clause.attribute for clause in ldf._intent
            if clause.value == "" and clause.attribute != "Record"
        ])
        recommendation = {
            "action": "Filter",
            "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent.",
        }
        categorical_vars = []
        for col in list(ldf.columns):
            # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
            if ldf.cardinality[col] < 30 and col not in column_spec_attr:
                categorical_vars.append(col)
        for cat in categorical_vars:
            unique_values = ldf.unique_values[cat]
            for i in range(0, len(unique_values)):
                new_spec = column_spec.copy()
                new_filter = lux.Clause(attribute=cat,
                                        filter_op="=",
                                        value=unique_values[i])
                new_spec.append(new_filter)
                temp_vis = Vis(new_spec)
                output.append(temp_vis)
    vlist = lux.vis.VisList.VisList(output, ldf)
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    vlist = vlist.topK(15)
    recommendation["collection"] = vlist
    return recommendation
Exemple #17
0
def correlation(ldf: LuxDataFrame, ignore_transpose: bool = True):
    """
    Generates bivariate visualizations that represent all pairwise relationships in the data.

    Parameters
    ----------
    ldf : LuxDataFrame
            LuxDataFrame with underspecified intent.

    ignore_transpose: bool
            Boolean flag to ignore pairs of attributes whose transpose are already computed (i.e., {X,Y} will be ignored if {Y,X} is already computed)

    Returns
    -------
    recommendations : Dict[str,obj]
            object with a collection of visualizations that result from the Correlation action.
    """

    import numpy as np

    # Carry the user's existing filters over to every generated scatter plot
    filter_specs = utils.get_filter_specs(ldf._intent)
    # Two measure wildcards -> one vis per pair of quantitative attributes
    intent = [
        lux.Clause("?", data_model="measure"),
        lux.Clause("?", data_model="measure"),
    ]
    intent.extend(filter_specs)
    vlist = VisList(intent, ldf)
    recommendation = {
        "action":
        "Correlation",
        "description":
        "Show relationships between two <p class='highlight-descriptor'>quantitative</p> attributes.",
    }
    ignore_rec_flag = False
    # Doesn't make sense to compute correlation if less than 4 data values
    if len(ldf) < 5:
        ignore_rec_flag = True
    # Then use the data populated in the vis list to compute score
    for vis in vlist:
        measures = vis.get_attr_by_data_model("measure")
        if len(measures) < 2:
            # NOTE(review): `x.attribute` assumes ldf.columns yields objects
            # with an .attribute field -- plain string column labels would
            # raise AttributeError here; confirm against LuxDataFrame.
            raise ValueError(
                f"Can not compute correlation between {[x.attribute for x in ldf.columns]} since less than 2 measure values present."
            )
        msr1 = measures[0].attribute
        msr2 = measures[1].attribute

        if ignore_transpose:
            # Skip {Y,X} when {X,Y} has already been scored in this list
            check_transpose = check_transpose_not_computed(vlist, msr1, msr2)
        else:
            check_transpose = True
        if check_transpose:
            vis.score = interestingness(vis, ldf)
        else:
            # Transposed duplicate: score -1 so it sorts to the bottom
            vis.score = -1
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vlist.sort()
    vlist = vlist.showK()
    recommendation["collection"] = vlist
    return recommendation
Exemple #18
0
def interestingness(view: View, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the view.
    The interestingness metric is dependent on the view type.

    Parameters
    ----------
    view : View
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score
    """

    if view.data is None:
        raise Exception(
            "View.data needs to be populated before interestingness can be computed. Run Executor.execute(view,ldf)."
        )

    n_dim = 0
    n_msr = 0

    filter_specs = utils.get_filter_specs(view.spec_lst)
    view_attrs_specs = utils.get_attrs_specs(view.spec_lst)

    # Count dimensions and measures, ignoring the implicit "Record" attribute
    for spec in view_attrs_specs:
        if spec.attribute != "Record":
            if spec.data_model == 'dimension':
                n_dim += 1
            if spec.data_model == 'measure':
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [
        spec for spec in view_attrs_specs if spec.attribute != "Record"
    ]
    dimension_lst = view.get_attr_by_data_model("dimension")
    measure_lst = view.get_attr_by_data_model("measure")

    # Bar Chart
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if n_filter == 0:
            return unevenness(view, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(view, ldf, filter_specs,
                                          measure_lst[0].attribute)
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if n_filter == 0:
            v = view.data["Count of Records"]
            return skewness(v)
        elif n_filter == 1:
            return deviation_from_overall(view, ldf, filter_specs,
                                          "Count of Records")
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if n_filter == 1:
            # Significance: fraction of the data that survives the filter
            v_filter_size = get_filtered_size(filter_specs, view.data)
            v_size = len(view.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(view, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        color_channel = view.get_attr_by_channel("color")
        if color_channel:
            C = ldf.cardinality[color_channel[0].attribute]
            # Penalize high-cardinality color encodings
            if C < 40:
                return 1 / C
            else:
                return -1
        # No color channel: fixed low score.  (This fallback was previously a
        # duplicate, unreachable `elif n_dim == 1 and n_msr == 2` branch, and
        # the branch above raised IndexError when no color channel was set.)
        return 0.2
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # Default
    else:
        return -1
Exemple #19
0
def univariate(ldf, data_type_constraint="quantitative"):
    '''
    Generates bar chart distributions of different attributes in the dataframe.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.

    data_type_constraint: str
        Controls the type of distribution chart that will be rendered.
        One of "quantitative", "nominal", or "temporal".

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Distribution action.

    Raises
    ------
    ValueError
        If data_type_constraint is not a recognized data type.
    '''
    import numpy as np

    # Carry existing filters over to every generated chart
    filter_specs = utils.get_filter_specs(ldf._intent)
    ignore_rec_flag = False
    if data_type_constraint == "quantitative":
        intent = [
            lux.Clause("?",
                       data_type="quantitative",
                       exclude="Number of Records")
        ]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Distribution",
            "description": "Show univariate histograms of <p class='highlight-descriptor'>quantitative</p>  attributes."
        }
        # Doesn't make sense to generate a histogram if there is less than 5 datapoints (pre-aggregated)
        if len(ldf) < 5:
            ignore_rec_flag = True
    elif data_type_constraint == "nominal":
        intent = [lux.Clause("?", data_type="nominal")]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Occurrence",
            "description": "Show frequency of occurrence for <p class='highlight-descriptor'>categorical</p> attributes."
        }
    elif data_type_constraint == "temporal":
        intent = [lux.Clause("?", data_type="temporal")]
        intent.extend(filter_specs)
        recommendation = {
            "action": "Temporal",
            "description": "Show trends over <p class='highlight-descriptor'>time-related</p> attributes."
        }
        # Doesn't make sense to generate a line chart if there is less than 3 datapoints (pre-aggregated)
        if len(ldf) < 3:
            ignore_rec_flag = True
    else:
        # Previously an unrecognized constraint fell through to a NameError
        # on `recommendation`; fail loudly with a clear message instead.
        raise ValueError(
            f"Unknown data_type_constraint: {data_type_constraint!r}")
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vlist = VisList(intent, ldf)
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    # vlist = vlist.topK(15) # Basic visualizations should not be capped
    vlist.sort()
    recommendation["collection"] = vlist
    return recommendation
Exemple #20
0
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type, dispatching on
    the number of dimensions, measures, and filters in the inferred intent.

    Parameters
    ----------
    vis : Vis
        Vis whose data has already been populated by the executor.
    ldf : LuxDataFrame
        Source dataframe, used for cardinality and unique-value lookups.

    Returns
    -------
    int
            Interestingness Score; -1 marks a vis to deprioritize.
    """

    # Unscored until the executor has populated vis.data.
    if vis.data is None or len(vis.data) == 0:
        return -1

    n_dim = 0
    n_msr = 0

    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

    # "Record" is the synthetic count attribute; it is tracked separately and
    # excluded from the dimension/measure tallies below.
    record_attrs = list(
        filter(
            lambda x: x.attribute == "Record" and x.data_model == "measure",
            vis_attrs_specs,
        ))
    n_record = len(record_attrs)
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [
        clause for clause in vis_attrs_specs if clause.attribute != "Record"
    ]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)
    # Line/Bar Chart
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs,
                                          measure_lst[0].attribute)
        # NOTE(review): more than one filter falls through and returns None.
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        # Both metrics need the aggregated count column. (The original code
        # tested `"Number of Records" in vis.data` twice in the n_filter == 0
        # path; the redundant inner check was removed.)
        if "Number of Records" in vis.data:
            if n_filter == 0:
                return skewness(vis.data["Number of Records"])
            elif n_filter == 1:
                return deviation_from_overall(vis, ldf, filter_specs,
                                              "Number of Records")
        return -1
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 10:
            return -1
        if vis.mark == "heatmap":
            return weighted_correlation(vis.data["xBinStart"],
                                        vis.data["yBinStart"],
                                        vis.data["count"])
        if n_filter == 1:
            # Significance: fraction of the overall data kept by the filter.
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 10:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute

        C = ldf.cardinality[color_attr]
        # Fewer color categories are easier to read, hence the higher score.
        if C < 40:
            return 1 / C
        else:
            return -1
    # BUGFIX: removed a second `n_dim == 1 and n_msr == 2` branch (returning
    # 0.2) that duplicated the condition above and was unreachable.
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line chart case
    elif vis.mark == "line" and n_dim == 2:
        return 0.15
    # for colored bar chart, scoring based on Chi-square test for independence score.
    # gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users
    elif vis.mark == "bar" and n_dim == 2:
        from scipy.stats import chi2_contingency

        measure_column = vis.get_attr_by_data_model("measure")[0].attribute
        dimension_columns = vis.get_attr_by_data_model("dimension")

        groupby_column = dimension_columns[0].attribute
        color_column = dimension_columns[1].attribute

        contingency_table = []
        groupby_cardinality = ldf.cardinality[groupby_column]
        groupby_unique_vals = ldf.unique_values[groupby_column]
        for c in range(0, groupby_cardinality):
            contingency_table.append(
                vis.data[vis.data[groupby_column] ==
                         groupby_unique_vals[c]][measure_column])
        # Fallback score used when the chi-square test cannot be computed.
        score = 0.12
        try:
            color_cardinality = ldf.cardinality[color_column]
            # scale down score based on number of categories
            chi2_score = chi2_contingency(contingency_table)[0] * 0.9**(
                color_cardinality + groupby_cardinality)
            score = min(0.10, chi2_score)
        except ValueError:
            # ValueError results if an entire column of the contingency table
            # is 0; can happen if an applied filter leaves a category with no
            # counts.
            pass
        return score
    # Default
    else:
        return -1
Exemple #21
0
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    int
        Interestingness Score

    Raises
    ------
    Exception
        If vis.data has not been populated by the executor.
    """
    if vis.data is None or len(vis.data) == 0:
        raise Exception("Vis.data needs to be populated before interestingness can be computed. Run Executor.execute(vis,ldf).")

    n_dim = 0
    n_msr = 0

    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)

    # "Record" is the synthetic count attribute; it is tallied separately and
    # excluded from the dimension/measure counts below.
    record_attrs = list(
        filter(lambda x: x.attribute == "Record" and x.data_model == "measure",
               vis_attrs_specs))
    n_record = len(record_attrs)
    for clause in vis_attrs_specs:
        if clause.attribute != "Record":
            if clause.data_model == "dimension":
                n_dim += 1
            if clause.data_model == "measure":
                n_msr += 1
    n_filter = len(filter_specs)
    attr_specs = [
        clause for clause in vis_attrs_specs if clause.attribute != "Record"
    ]
    dimension_lst = vis.get_attr_by_data_model("dimension")
    measure_lst = vis.get_attr_by_data_model("measure")
    v_size = len(vis.data)
    # Line/Bar Chart
    if n_dim == 1 and (n_msr == 0 or n_msr == 1):
        if v_size < 2:
            return -1
        if n_filter == 0:
            return unevenness(vis, ldf, measure_lst, dimension_lst)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs,
                                          measure_lst[0].attribute)
        # NOTE(review): more than one filter falls through and returns None.
    # Histogram
    elif n_dim == 0 and n_msr == 1:
        if v_size < 2:
            return -1
        if n_filter == 0:
            v = vis.data["Number of Records"]
            return skewness(v)
        elif n_filter == 1:
            return deviation_from_overall(vis, ldf, filter_specs,
                                          "Number of Records")
    # Scatter Plot
    elif n_dim == 0 and n_msr == 2:
        if v_size < 2:
            return -1
        if n_filter == 1:
            # Significance: fraction of the overall data kept by the filter.
            v_filter_size = get_filtered_size(filter_specs, vis.data)
            sig = v_filter_size / v_size
        else:
            sig = 1
        return sig * monotonicity(vis, attr_specs)
    # Scatterplot colored by Dimension
    elif n_dim == 1 and n_msr == 2:
        if v_size < 5:
            return -1
        color_attr = vis.get_attr_by_channel("color")[0].attribute

        C = ldf.cardinality[color_attr]
        # Fewer color categories are easier to read, hence the higher score.
        if C < 40:
            return 1 / C
        else:
            return -1
    # BUGFIX: removed a second `n_dim == 1 and n_msr == 2` branch (returning
    # 0.2) that duplicated the condition above and was unreachable.
    # Scatterplot colored by measure
    elif n_msr == 3:
        return 0.1
    # colored line and barchart cases
    elif (vis.mark == "line" or vis.mark == "bar") and n_dim == 2:
        return 0.2
    # Default
    else:
        return -1
Exemple #22
0
def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
    """
    Compute the interestingness score of the vis.
    The interestingness metric is dependent on the vis type.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame

    Returns
    -------
    int
            Interestingness Score; -1 marks a vis to deprioritize.
    """

    # Unscored until the executor has populated vis.data.
    if vis.data is None or len(vis.data) == 0:
        return -1
    try:
        filter_specs = utils.get_filter_specs(vis._inferred_intent)
        vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent)
        n_dim = vis._ndim
        n_msr = vis._nmsr
        n_filter = len(filter_specs)
        attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"]
        dimension_lst = vis.get_attr_by_data_model("dimension")
        measure_lst = vis.get_attr_by_data_model("measure")
        v_size = len(vis.data)

        # Similarity scoring: when the current intent is a single filtered line
        # chart, score candidate line charts by visual similarity to it.
        if (
            n_dim == 1
            and (n_msr == 0 or n_msr == 1)
            and ldf.current_vis is not None
            and vis.get_attr_by_channel("y")[0].data_type == "quantitative"
            and len(ldf.current_vis) == 1
            and ldf.current_vis[0].mark == "line"
            and len(get_filter_specs(ldf.intent)) > 0
        ):
            query_vc = VisList(ldf.current_vis, ldf)
            query_vis = query_vc[0]
            preprocess(query_vis)
            preprocess(vis)
            return 1 - euclidean_dist(query_vis, vis)

        # Line/Bar Chart
        if n_dim == 1 and (n_msr == 0 or n_msr == 1):
            if v_size < 2:
                return -1

            if n_filter == 0:
                return unevenness(vis, ldf, measure_lst, dimension_lst)
            elif n_filter == 1:
                return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute)
            # NOTE(review): more than one filter falls through and returns None.
        # Histogram
        elif n_dim == 0 and n_msr == 1:
            if v_size < 2:
                return -1
            # Both metrics need the aggregated count column. (The redundant
            # duplicated membership check in the n_filter == 0 path was removed.)
            if "Number of Records" in vis.data:
                if n_filter == 0:
                    return skewness(vis.data["Number of Records"])
                elif n_filter == 1:
                    return deviation_from_overall(vis, ldf, filter_specs, "Number of Records")
            return -1
        # Scatter Plot
        elif n_dim == 0 and n_msr == 2:
            if v_size < 10:
                return -1
            if vis.mark == "heatmap":
                return weighted_correlation(
                    vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"]
                )
            if n_filter == 1:
                # Significance: fraction of the overall data kept by the filter.
                v_filter_size = get_filtered_size(filter_specs, vis.data)
                sig = v_filter_size / v_size
            else:
                sig = 1
            return sig * monotonicity(vis, attr_specs)
        # Scatterplot colored by Dimension
        elif n_dim == 1 and n_msr == 2:
            if v_size < 10:
                return -1
            color_attr = vis.get_attr_by_channel("color")[0].attribute

            C = ldf.cardinality[color_attr]
            # Fewer color categories are easier to read, hence the higher score.
            if C < 40:
                return 1 / C
            else:
                return -1
        # BUGFIX: removed a second `n_dim == 1 and n_msr == 2` branch (returning
        # 0.2) that duplicated the condition above and was unreachable.
        # Scatterplot colored by measure
        elif n_msr == 3:
            return 0.1
        # colored line chart case
        elif vis.mark == "line" and n_dim == 2:
            return 0.15
        # for colored bar chart, scoring based on Chi-square test for independence score.
        # gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users
        elif vis.mark == "bar" and n_dim == 2:
            from scipy.stats import chi2_contingency

            measure_column = vis.get_attr_by_data_model("measure")[0].attribute
            dimension_columns = vis.get_attr_by_data_model("dimension")

            groupby_column = dimension_columns[0].attribute
            color_column = dimension_columns[1].attribute

            contingency_tbl = pd.crosstab(
                vis.data[groupby_column],
                vis.data[color_column],
                values=vis.data[measure_column],
                aggfunc=sum,
            )

            try:
                color_cardinality = ldf.cardinality[color_column]
                groupby_cardinality = ldf.cardinality[groupby_column]
                # scale down score based on number of categories
                chi2_score = chi2_contingency(contingency_tbl)[0] * 0.9 ** (
                    color_cardinality + groupby_cardinality
                )
                score = min(0.10, chi2_score)
            except (ValueError, KeyError):
                # ValueError results if an entire column of the contingency table is 0,
                # which can happen if an applied filter leaves a category with no counts
                score = -1
            return score
        # Default
        else:
            return -1
    # BUGFIX: narrowed the original bare `except:` so KeyboardInterrupt and
    # SystemExit are not swallowed by the interestingness fallback.
    except Exception:
        if lux.config.interestingness_fallback:
            # Suppress interestingness related issues
            warnings.warn(f"An error occurred when computing interestingness for: {vis}")
            return -1
        else:
            raise
Exemple #23
0
def univariate(ldf, *args):
    """
    Generates bar chart distributions of different attributes in the dataframe.

    Parameters
    ----------
    ldf : lux.core.frame
            LuxDataFrame with underspecified intent.

    data_type_constraint: str
            Controls the type of distribution chart that will be rendered.

    Returns
    -------
    recommendations : Dict[str,obj]
            object with a collection of visualizations that result from the Distribution action.
    """
    # NOTE: the unused `import numpy as np` was removed.
    # First positional argument (when present) carries the data type
    # constraint, packed as a one-element sequence by the caller.
    if len(args) == 0:
        data_type_constraint = "quantitative"
    else:
        data_type_constraint = args[0][0]

    filter_specs = utils.get_filter_specs(ldf._intent)
    ignore_rec_flag = False
    if data_type_constraint == "quantitative":
        # Skip low-cardinality quantitative columns and the synthetic count
        # column; histograms over them are not informative.
        possible_attributes = [
            c for c in ldf.columns if ldf.data_type[c] == "quantitative"
            and ldf.cardinality[c] > 5 and c != "Number of Records"
        ]
        intent = [lux.Clause(possible_attributes)]
        intent.extend(filter_specs)
        recommendation = {
            "action":
            "Distribution",
            "description":
            "Show univariate histograms of <p class='highlight-descriptor'>quantitative</p>  attributes.",
        }
        # Doesn't make sense to generate a histogram if there is less than 5 datapoints (pre-aggregated)
        if len(ldf) < 5:
            ignore_rec_flag = True
    elif data_type_constraint == "nominal":
        intent = [lux.Clause("?", data_type="nominal")]
        intent.extend(filter_specs)
        recommendation = {
            "action":
            "Occurrence",
            "description":
            "Show frequency of occurrence for <p class='highlight-descriptor'>categorical</p> attributes.",
        }
    elif data_type_constraint == "temporal":
        intent = [lux.Clause("?", data_type="temporal")]
        intent.extend(filter_specs)
        recommendation = {
            "action":
            "Temporal",
            "description":
            "Show trends over <p class='highlight-descriptor'>time-related</p> attributes.",
        }
        # Doesn't make sense to generate a line chart if there is less than 3 datapoints (pre-aggregated)
        if len(ldf) < 3:
            ignore_rec_flag = True
    else:
        # BUGFIX: an unrecognized constraint previously fell through to a
        # NameError on the undefined `recommendation`/`intent` below; return
        # an empty recommendation instead.
        return {"action": "Distribution", "description": "", "collection": []}
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vlist = VisList(intent, ldf)
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    vlist.sort()
    recommendation["collection"] = vlist
    return recommendation
Exemple #24
0
def generalize(ldf):
    """
    Generates all possible visualizations when one attribute or filter from the current vis is removed.

    Parameters
    ----------
    ldf : lux.core.frame
            LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
            object with a collection of visualizations that result from the Generalize action.
    """
    # takes in a dataObject and generates a list of new dataObjects, each with a single measure from the original object removed
    # -->  return list of dataObjects with corresponding interestingness scores

    output = []
    excluded_columns = []
    attributes = list(
        filter(lambda x: x.value == "" and x.attribute != "Record",
               ldf._intent))
    filters = utils.get_filter_specs(ldf._intent)

    fltr_str = [
        fltr.attribute + fltr.filter_op + str(fltr.value) for fltr in filters
    ]
    attr_str = [clause.attribute for clause in attributes]
    intended_attrs = ('<p class="highlight-intent">' +
                      ", ".join(attr_str + fltr_str) + "</p>")

    recommendation = {
        "action": "Generalize",
        "description": f"Remove an attribute or filter from {intended_attrs}.",
    }
    # to observe a more general trend
    # if we do not have enough column attributes or too many, return no vis.
    if len(attributes) < 1 or len(attributes) > 4:
        recommendation["collection"] = []
        return recommendation
    # for each column specification, create a copy of the ldf's vis with the
    # column removed, and append the new vis to the output
    if len(attributes) > 1:
        for clause in attributes:
            columns = clause.attribute
            if isinstance(columns, list):
                for column in columns:
                    if column not in excluded_columns:
                        temp_vis = Vis(ldf.copy_intent(), score=1)
                        temp_vis.remove_column_from_spec(column,
                                                         remove_first=True)
                        excluded_columns.append(column)
                        output.append(temp_vis)
            elif isinstance(columns, str):
                if columns not in excluded_columns:
                    temp_vis = Vis(ldf.copy_intent(), score=1)
                    temp_vis.remove_column_from_spec(columns,
                                                     remove_first=True)
                    excluded_columns.append(columns)
                    # BUGFIX: this append previously sat at loop level, which
                    # duplicated the last created vis after a list-typed clause
                    # (or raised NameError when no vis had been created yet).
                    output.append(temp_vis)
    # for each filter specification, create a copy of the ldf's current vis
    # with the filter removed, then append the vis to the output
    for clause in filters:
        temp_vis = Vis(
            ldf.current_vis[0]._inferred_intent.copy(),
            source=ldf,
            title="Overall",
            score=0,
        )
        temp_vis.remove_filter_from_spec(clause.value)
        output.append(temp_vis)

    vlist = lux.vis.VisList.VisList(output, source=ldf)
    # Ignore interestingness sorting since Generalize yields very few vis
    # (preserve order of remove attribute, then remove filters)

    vlist.remove_duplicates()
    vlist.sort(remove_invalid=True)
    recommendation["collection"] = vlist
    return recommendation
Exemple #25
0
def filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates
    visualizations where each categorical value filters the data.

    NOTE: the function name shadows the builtin `filter`; kept for API
    compatibility with callers elsewhere.

    Parameters
    ----------
    ldf : lux.luxDataFrame.LuxDataFrame
        LuxDataFrame with underspecified context.

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Filter action.
    """
    # for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    recommendation = {
        "action":
        "Filter",
        "description":
        "Shows possible visualizations when filtered by categorical variables in the dataset."
    }
    filters = utils.get_filter_specs(ldf.context)
    filter_values = []
    output = []
    # if Row is specified, create visualizations where data is filtered by all values of the Row's categorical variable
    column_spec = utils.get_attrs_specs(ldf.current_view[0].spec_lst)
    # BUGFIX: materialize to a list -- the original bare `map` object was an
    # iterator that the first `in` membership test exhausted, so every later
    # column incorrectly passed the exclusion check below.
    column_spec_attr = list(map(lambda x: x.attribute, column_spec))
    if len(filters) > 0:
        # get unique values for all categorical values specified and creates corresponding filters
        for row in filters:
            unique_values = ldf.unique_values[row.attribute]
            filter_values.append(row.value)
            # creates views with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_filter = lux.Spec(attribute=row.attribute, value=val)
                    new_spec.append(new_filter)
                    temp_view = View(new_spec)
                    output.append(temp_view)
    else:  # if no existing filters, create filters using unique values from all categorical variables in the dataset
        categorical_vars = []
        for col in list(ldf.columns):
            # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
            if ldf.cardinality[col] < 40 and col not in column_spec_attr:
                categorical_vars.append(col)
        for cat in categorical_vars:
            unique_values = ldf.unique_values[cat]
            for i in range(0, len(unique_values)):
                new_spec = column_spec.copy()
                new_filter = lux.Spec(attribute=cat,
                                      filter_op="=",
                                      value=unique_values[i])
                new_spec.append(new_filter)
                temp_view = View(new_spec)
                output.append(temp_view)
    vc = lux.view.ViewCollection.ViewCollection(output)
    vc = vc.load(ldf)
    for view in vc:
        view.score = interestingness(view, ldf)
    vc = vc.topK(15)
    recommendation["collection"] = vc

    # for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed filter action in {toc - tic:0.4f} seconds")
    return recommendation
Exemple #26
0
def add_filter(ldf):
    """
    Iterates over all possible values of a categorical variable and generates visualizations where each categorical
    value filters the data.

    Parameters
    ----------
    ldf : lux.core.frame
            LuxDataFrame with underspecified intent.

    Returns
    -------
    recommendations : Dict[str,obj]
            object with a collection of visualizations that result from the Filter action.
    """
    filters = utils.get_filter_specs(ldf._intent)
    filter_values = []
    output = []
    # BUGFIX: default recommendation -- previously, a single filter whose
    # attribute was neither nominal nor quantitative (e.g. temporal) left
    # `recommendation` unbound and raised UnboundLocalError at the end.
    recommendation = {
        "action": "Filter",
        "description": "Applying filters to the current intent.",
        "long_description": "Applying filters to the current intent.",
    }
    # if fltr is specified, create visualizations where data is filtered by all values of the fltr's categorical
    # variable
    column_spec = utils.get_attrs_specs(ldf.current_vis[0].intent)
    column_spec_attr = list(map(lambda x: x.attribute, column_spec))
    if len(filters) == 1:
        # get unique values for all categorical values specified and creates corresponding filters
        fltr = filters[0]

        if ldf.data_type[fltr.attribute] == "nominal":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                               f"alternative value.",
                "long_description": f"Swap out the filter value for {fltr.attribute} to other possible values, while "
                                    f"keeping all else the same. Visualizations are ranked based on interestingness",
            }
            unique_values = ldf.unique_values[fltr.attribute]
            filter_values.append(fltr.value)
            # creates vis with new filters
            for val in unique_values:
                if val not in filter_values:
                    new_spec = column_spec.copy()
                    new_filter = lux.Clause(attribute=fltr.attribute, value=val)
                    new_spec.append(new_filter)
                    temp_vis = Vis(new_spec)
                    output.append(temp_vis)
        elif ldf.data_type[fltr.attribute] == "quantitative":
            recommendation = {
                "action": "Filter",
                "description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                               f"alternative inequality operation.",
                "long_description": f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an "
                                    f"alternative inequality operation.",
            }

            # Create vis with complementary filter operations
            # NOTE: This section of code has been modified to allow for the rendering of multiple vis
            for op in get_complementary_ops(fltr.filter_op):
                new_spec = column_spec.copy()
                new_filter = lux.Clause(
                    attribute=fltr.attribute,
                    filter_op=op,
                    value=fltr.value,
                )
                new_spec.append(new_filter)
                temp_vis = Vis(new_spec, score=1)
                output.append(temp_vis)

    # if no existing filters, create filters using unique values from all categorical variables in the dataset
    else:
        intended_attrs = ", ".join(
            [
                str(clause.attribute)
                for clause in ldf._intent
                if clause.value == "" and clause.attribute != "Record"
            ]
        )
        recommendation = {
            "action": "Filter",
            "description": f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent.",
            "long_description": f"Adding any filter while keeping the attributes on the x and y axes fixed. "
                                f"Visualizations are ranked based on interestingness",
        }
        categorical_vars = []
        for col in list(ldf.columns):
            # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
            if 1 < ldf.cardinality[col] < 30 and col not in column_spec_attr:
                categorical_vars.append(col)
        for cat in categorical_vars:
            unique_values = ldf.unique_values[cat]
            for val in unique_values:
                new_spec = column_spec.copy()
                new_filter = lux.Clause(attribute=cat, filter_op="=", value=val)
                new_spec.append(new_filter)
                temp_vis = Vis(new_spec)
                output.append(temp_vis)
    # Similarity override: when the current intent is a single filtered line
    # chart, recommend visually similar charts instead of plain filter swaps.
    if (
        ldf.current_vis is not None
        and len(ldf.current_vis) == 1
        and ldf.current_vis[0].mark == "line"
        and len(get_filter_specs(ldf.intent)) > 0
    ):
        recommendation = {
            "action": "Similarity",
            "description": "Show other charts that are visually similar to the Current vis.",
            "long_description": "Show other charts that are visually similar to the Current vis.",
        }
        last = get_filter_specs(ldf.intent)[-1]
        output = ldf.intent.copy()[0:-1]
        # array of possible values for attribute
        arr = ldf[last.attribute].unique().tolist()
        output.append(lux.Clause(last.attribute, last.attribute, arr))
    vlist = lux.vis.VisList.VisList(output, ldf)
    vlist_copy = lux.vis.VisList.VisList(output, ldf)
    for i in range(len(vlist_copy)):
        vlist[i].score = interestingness(vlist_copy[i], ldf)
    vlist.sort()
    vlist = vlist.showK()
    # The first Similarity result is the current vis itself; drop it.
    if recommendation["action"] == "Similarity":
        recommendation["collection"] = vlist[1:]
    else:
        recommendation["collection"] = vlist
    return recommendation
Exemple #27
0
def correlation(ldf: LuxDataFrame, ignore_transpose: bool = True):
    """
    Generates bivariate visualizations that represent all pairwise relationships in the data.

    Parameters
    ----------
    ldf : LuxDataFrame
        LuxDataFrame with underspecified intent.

    ignore_transpose : bool
        Boolean flag to ignore pairs of attributes whose transpose are already
        computed (i.e., {X,Y} will be ignored if {Y,X} is already computed)

    Returns
    -------
    recommendations : Dict[str,obj]
        object with a collection of visualizations that result from the Correlation action.
    """

    import numpy as np

    # for benchmarking
    if ldf.toggle_benchmarking == True:
        tic = time.perf_counter()
    filter_specs = utils.get_filter_specs(ldf.intent)
    intent = [
        lux.Clause("?", data_model="measure"),
        lux.Clause("?", data_model="measure"),
    ]
    intent.extend(filter_specs)
    vc = VisList(intent, ldf)
    recommendation = {
        "action":
        "Correlation",
        "description":
        "Show relationships between two <p class='highlight-descriptor'>quantitative</p> attributes."
    }
    # Doesn't make sense to compute correlation if less than 4 data values
    ignore_rec_flag = len(ldf) < 5
    # Use the data populated in the vis list to compute each score
    for cvis in vc:
        measures = cvis.get_attr_by_data_model("measure")
        if len(measures) < 2:
            raise ValueError(
                f"Can not compute correlation between {[x.attribute for x in ldf.columns]} since less than 2 measure values present."
            )
        msr1 = measures[0].attribute
        msr2 = measures[1].attribute

        # Only score one orientation of each attribute pair when the caller
        # wants transposed duplicates skipped.
        if ignore_transpose:
            keep = check_transpose_not_computed(vc, msr1, msr2)
        else:
            keep = True
        cvis.score = interestingness(cvis, ldf) if keep else -1
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vc = vc.topK(15)
    recommendation["collection"] = vc

    # for benchmarking
    if ldf.toggle_benchmarking == True:
        toc = time.perf_counter()
        print(f"Performed correlation action in {toc - tic:0.4f} seconds")
    return recommendation
Exemple #28
0
def _univariate_possible_attributes(ldf, data_type):
    # Columns of the requested data type with enough distinct values to be
    # worth plotting (>5), excluding the synthesized "Number of Records" column.
    return [
        c for c in ldf.columns if ldf.data_type[c] == data_type
        and ldf.cardinality[c] > 5 and c != "Number of Records"
    ]


def univariate(ldf, *args):
    """
    Generates bar chart distributions of different attributes in the dataframe.

    Parameters
    ----------
    ldf : lux.core.frame
            LuxDataFrame with underspecified intent.

    data_type_constraint: str
            Controls the type of distribution chart that will be rendered.
            One of "quantitative", "nominal", "geographical", or "temporal".

    Returns
    -------
    recommendations : Dict[str,obj]
            object with a collection of visualizations that result from the Distribution action.

    Raises
    ------
    ValueError
            If data_type_constraint is not one of the supported values.
    """
    import numpy as np

    # Default to quantitative distributions when no constraint is supplied;
    # otherwise the constraint is passed as the first element of args[0].
    if len(args) == 0:
        data_type_constraint = "quantitative"
    else:
        data_type_constraint = args[0][0]

    # Preserve any filters already present in the intent so they are applied
    # to every generated visualization.
    filter_specs = utils.get_filter_specs(ldf._intent)
    ignore_rec_flag = False
    if data_type_constraint == "quantitative":
        possible_attributes = _univariate_possible_attributes(ldf, "quantitative")
        intent = [lux.Clause(possible_attributes)]
        intent.extend(filter_specs)
        examples = ""
        if len(possible_attributes) >= 1:
            examples = f" (e.g., {possible_attributes[0]})"
        recommendation = {
            "action":
            "Distribution",
            "description":
            "Show univariate histograms of <p class='highlight-descriptor'>quantitative</p>  attributes.",
            "long_description":
            f"Distribution displays univariate histogram distributions of all quantitative attributes{examples}. Visualizations are ranked from most to least skewed.",
        }
        # Doesn't make sense to generate a histogram if there is less than 5 datapoints (pre-aggregated)
        if ldf.length < 5:
            ignore_rec_flag = True
    elif data_type_constraint == "nominal":
        possible_attributes = _univariate_possible_attributes(ldf, "nominal")
        examples = ""
        if len(possible_attributes) >= 1:
            examples = f" (e.g., {possible_attributes[0]})"
        intent = [lux.Clause("?", data_type="nominal")]
        intent.extend(filter_specs)
        recommendation = {
            "action":
            "Occurrence",
            "description":
            "Show frequency of occurrence for <p class='highlight-descriptor'>categorical</p> attributes.",
            "long_description":
            f"Occurrence displays bar charts of counts for all categorical attributes{examples}. Visualizations are ranked from most to least uneven across the bars. ",
        }
    elif data_type_constraint == "geographical":
        possible_attributes = _univariate_possible_attributes(ldf, "geographical")
        examples = ""
        if len(possible_attributes) >= 1:
            examples = f" (e.g., {possible_attributes[0]})"
        intent = [
            lux.Clause("?", data_type="geographical"),
            lux.Clause("?", data_model="measure")
        ]
        intent.extend(filter_specs)
        recommendation = {
            "action":
            "Geographical",
            "description":
            "Show choropleth maps of <p class='highlight-descriptor'>geographic</p> attributes",
            # Fixed copy-paste from the nominal branch: this action is
            # "Geographical", not "Occurrence".
            "long_description":
            f"Geographical displays choropleths of averages for some geographic attribute{examples}. Visualizations are ranked by diversity of the geographic attribute.",
        }
    elif data_type_constraint == "temporal":
        intent = [lux.Clause("?", data_type="temporal")]
        intent.extend(filter_specs)
        recommendation = {
            "action":
            "Temporal",
            "description":
            "Show trends over <p class='highlight-descriptor'>time-related</p> attributes.",
            "long_description":
            "Temporal displays line charts for all attributes related to datetimes in the dataframe.",
        }
        # Doesn't make sense to generate a line chart if there is less than 3 datapoints (pre-aggregated)
        if ldf.length < 3:
            ignore_rec_flag = True
    else:
        # Previously an unknown constraint fell through and crashed later with
        # a NameError on `recommendation`; fail fast with a clear message.
        raise ValueError(
            f"Unsupported data_type_constraint: {data_type_constraint!r}")
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vlist = VisList(intent, ldf)
    for vis in vlist:
        vis.score = interestingness(vis, ldf)
    vlist.sort()
    recommendation["collection"] = vlist
    return recommendation
Exemple #29
0
    def determine_encoding(ldf: LuxDataFrame, vis: Vis):
        """
        Populates Vis with the appropriate mark type and channel information based on ShowMe logic
        Currently support up to 3 dimensions or measures

        Parameters
        ----------
        ldf : lux.core.frame
                LuxDataFrame with underspecified intent
        vis : lux.vis.Vis

        Returns
        -------
        None

        Notes
        -----
        Implementing automatic encoding from Tableau's VizQL
        Mackinlay, J. D., Hanrahan, P., & Stolte, C. (2007).
        Show Me: Automatic presentation for visual analysis.
        IEEE Transactions on Visualization and Computer Graphics, 13(6), 1137–1144.
        https://doi.org/10.1109/TVCG.2007.70594
        """
        # Count number of measures and dimensions
        ndim = vis._ndim
        nmsr = vis._nmsr
        # preserve to add back to _inferred_intent later
        filters = utils.get_filter_specs(vis._inferred_intent)

        # Helper function (TODO: Move this into utils)
        def line_or_bar(ldf, dimension: Clause, measure: Clause):
            dim_type = dimension.data_type
            # If no aggregation function is specified, then default as average
            if measure.aggregation == "":
                measure.set_aggregation("mean")
            # Ordered dimensions (time or ordinal) map to line charts.
            # BUG FIX: the original compared against the misspelled string
            # "oridinal", making the ordinal case unreachable.
            if dim_type == "temporal" or dim_type == "ordinal":
                return "line", {"x": dimension, "y": measure}
            else:  # unordered categorical
                # if cardinality large than 5 then sort bars
                if ldf.cardinality[dimension.attribute] > 5:
                    dimension.sort = "ascending"
                return "bar", {"x": measure, "y": dimension}

        # ShowMe logic + additional heuristics
        # count_col = Clause( attribute="count()", data_model="measure")
        count_col = Clause(
            attribute="Record",
            aggregation="count",
            data_model="measure",
            data_type="quantitative",
        )
        auto_channel = {}
        if ndim == 0 and nmsr == 1:
            # Histogram with Count
            measure = vis.get_attr_by_data_model("measure",
                                                 exclude_record=True)[0]
            # BUG FIX: the original tested `< 0`, which is never true for a
            # length; the intent is to append the count column only when no
            # "Record" attribute is already present.
            if len(vis.get_attr_by_attr_name("Record")) < 1:
                vis._inferred_intent.append(count_col)
            # If no bin specified, then default as 10
            if measure.bin_size == 0:
                measure.bin_size = 10
            auto_channel = {"x": measure, "y": count_col}
            vis._mark = "histogram"
        elif ndim == 1 and (nmsr == 0 or nmsr == 1):
            # Line or Bar Chart
            if nmsr == 0:
                vis._inferred_intent.append(count_col)
            dimension = vis.get_attr_by_data_model("dimension")[0]
            measure = vis.get_attr_by_data_model("measure")[0]
            vis._mark, auto_channel = line_or_bar(ldf, dimension, measure)
        elif ndim == 2 and (nmsr == 0 or nmsr == 1):
            # Line or Bar chart broken down by the dimension
            dimensions = vis.get_attr_by_data_model("dimension")
            d1 = dimensions[0]
            d2 = dimensions[1]
            # The lower-cardinality dimension becomes the color channel.
            if ldf.cardinality[d1.attribute] < ldf.cardinality[d2.attribute]:
                # d1.channel = "color"
                vis.remove_column_from_spec(d1.attribute)
                dimension = d2
                color_attr = d1
            else:
                # if same attribute then remove_column_from_spec will remove both dims, we only want to remove one
                if d1.attribute == d2.attribute:
                    vis._inferred_intent.pop(0)
                else:
                    vis.remove_column_from_spec(d2.attribute)
                dimension = d1
                color_attr = d2
            # Colored Bar/Line chart with Count as default measure
            if not ldf.pre_aggregated:
                if nmsr == 0 and not ldf.pre_aggregated:
                    vis._inferred_intent.append(count_col)
                measure = vis.get_attr_by_data_model("measure")[0]
                vis._mark, auto_channel = line_or_bar(ldf, dimension, measure)
                auto_channel["color"] = color_attr
        elif ndim == 0 and nmsr == 2:
            # Scatterplot: raw values on both axes, so clear aggregations.
            vis._mark = "scatter"
            vis._inferred_intent[0].set_aggregation(None)
            vis._inferred_intent[1].set_aggregation(None)
            auto_channel = {
                "x": vis._inferred_intent[0],
                "y": vis._inferred_intent[1]
            }
        elif ndim == 1 and nmsr == 2:
            # Scatterplot broken down by the dimension
            measure = vis.get_attr_by_data_model("measure")
            m1 = measure[0]
            m2 = measure[1]

            vis._inferred_intent[0].set_aggregation(None)
            vis._inferred_intent[1].set_aggregation(None)

            color_attr = vis.get_attr_by_data_model("dimension")[0]
            vis.remove_column_from_spec(color_attr)
            vis._mark = "scatter"
            auto_channel = {"x": m1, "y": m2, "color": color_attr}
        elif ndim == 0 and nmsr == 3:
            # Scatterplot with color
            vis._mark = "scatter"
            auto_channel = {
                "x": vis._inferred_intent[0],
                "y": vis._inferred_intent[1],
                "color": vis._inferred_intent[2],
            }
        # Restrict the vis's min/max metadata to the attributes actually
        # used in the chosen channels (skipping the synthetic "Record" column).
        relevant_attributes = [
            auto_channel[channel].attribute for channel in auto_channel
        ]
        relevant_min_max = dict((attr, ldf._min_max[attr])
                                for attr in relevant_attributes
                                if attr != "Record" and attr in ldf._min_max)
        vis._min_max = relevant_min_max
        if auto_channel != {}:
            vis = Compiler.enforce_specified_channel(vis, auto_channel)
            vis._inferred_intent.extend(
                filters)  # add back the preserved filters
Exemple #30
0
def filter(ldf):
	'''
	Iterates over all possible values of a categorical variable and generates visualizations where each categorical value filters the data.

	Parameters
	----------
	ldf : lux.luxDataFrame.LuxDataFrame
		LuxDataFrame with underspecified intent.

	Returns
	-------
	recommendations : Dict[str,obj]
		object with a collection of visualizations that result from the Filter action.
	'''
	# NOTE: moved the docstring above the benchmarking code — previously it
	# followed executable statements and was therefore not a real docstring.
	#for benchmarking
	if ldf.toggle_benchmarking == True:
		tic = time.perf_counter()

	filters = utils.get_filter_specs(ldf.intent)
	filter_values = []
	output = []
	#if fltr is specified, create visualizations where data is filtered by all values of the fltr's categorical variable
	column_spec = utils.get_attrs_specs(ldf.current_vis[0]._inferred_intent)
	# BUG FIX: was a `map` iterator — a one-shot iterator is consumed by the
	# first `in` membership test, so repeated `col not in column_spec_attr`
	# checks in the loop below gave wrong answers. Materialize as a set.
	column_spec_attr = set(x.attribute for x in column_spec)
	if len(filters) == 1:
		#get unique values for all categorical values specified and creates corresponding filters
		fltr = filters[0]
		unique_values = ldf.unique_values[fltr.attribute]
		filter_values.append(fltr.value)
		#creates views with new filters
		for val in unique_values:
			if val not in filter_values:
				new_spec = column_spec.copy()
				new_filter = lux.Clause(attribute = fltr.attribute, value = val)
				new_spec.append(new_filter)
				temp_view = Vis(new_spec)
				output.append(temp_view)
		recommendation = {"action":"Filter",
					 	  "description":f"Changing the <p class='highlight-intent'>{fltr.attribute}</p> filter to an alternative value."}
	else:	#if no existing filters, create filters using unique values from all categorical variables in the dataset
		intended_attrs = '<b>'+', '.join([clause.attribute for clause in ldf.intent if clause.value=='' and clause.attribute!="Record"])+'</b>'
		recommendation = {"action":"Filter",
					 "description":f"Applying filters to the <p class='highlight-intent'>{intended_attrs}</p> intent."}
		categorical_vars = []
		for col in list(ldf.columns):
			# if cardinality is not too high, and attribute is not one of the X,Y (specified) column
			if ldf.cardinality[col]<30 and col not in column_spec_attr:
				categorical_vars.append(col)
		for cat in categorical_vars:
			unique_values = ldf.unique_values[cat]
			for i in range(0, len(unique_values)):
				new_spec = column_spec.copy()
				new_filter = lux.Clause(attribute=cat, filter_op="=",value=unique_values[i])
				new_spec.append(new_filter)
				temp_view = Vis(new_spec)
				output.append(temp_view)
	vc = lux.vis.VisList.VisList(output,ldf)
	for view in vc:
		view.score = interestingness(view,ldf)
	vc = vc.topK(15)
	recommendation["collection"] = vc

	#for benchmarking
	if ldf.toggle_benchmarking == True:
		toc = time.perf_counter()
		print(f"Performed filter action in {toc - tic:0.4f} seconds")
	return recommendation