Ejemplo n.º 1
0
def test_inequalityfilter():
    connection = psycopg2.connect(
        "host=localhost dbname=postgres user=postgres password=lux")
    sql_df = lux.LuxSQLTable()
    lux.config.set_SQL_connection(connection)
    sql_df.set_SQL_table("car")

    vis = Vis([
        lux.Clause(attribute="Horsepower", filter_op=">", value=50),
        lux.Clause(attribute="MilesPerGal"),
    ])
    vis._vis_data = sql_df
    filter_output = SQLExecutor.execute_filter(vis)
    assert filter_output[
        0] == 'WHERE "Horsepower" > \'50\' AND "MilesPerGal" IS NOT NULL'
    assert filter_output[1] == ["Horsepower"]

    intent = [
        lux.Clause(attribute="Horsepower", filter_op="<=", value=100),
        lux.Clause(attribute="MilesPerGal"),
    ]
    vis = Vis(intent, sql_df)
    vis._vis_data = sql_df
    filter_output = SQLExecutor.execute_filter(vis)
    assert filter_output[
        0] == 'WHERE "Horsepower" <= \'100\' AND "MilesPerGal" IS NOT NULL'
    assert filter_output[1] == ["Horsepower"]
Ejemplo n.º 2
0
def test_color_barchart_code_export(global_var):
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")

    vis = Vis([lux.Clause("origin"), lux.Clause("cylinders")], tbl)
    SQLExecutor.execute([vis], tbl)
    code = vis.to_code("python")
    try:
        exec(code, globals())
        create_chart_data(tbl, vis)
    except:
        assert False
Ejemplo n.º 3
0
def test_histogram_code_export(global_var):
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")

    vis = Vis([lux.Clause("horsepower")], tbl)
    SQLExecutor.execute([vis], tbl)
    code = vis.to_code("python")
    try:
        exec(code, globals())
        create_chart_data(tbl, vis)
    except:
        assert False
Ejemplo n.º 4
0
def test_lazy_execution():
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")

    intent = [
        lux.Clause(attribute="horsepower", aggregation="mean"),
        lux.Clause(attribute="origin"),
    ]
    vis = Vis(intent)
    # Check data field in vis is empty before calling executor
    assert vis.data is None
    SQLExecutor.execute([vis], tbl)
    assert type(vis.data) == lux.core.frame.LuxDataFrame
Ejemplo n.º 5
0
def test_lazy_execution():
    connection = psycopg2.connect(
        "host=localhost dbname=postgres user=postgres password=lux")
    sql_df = lux.LuxSQLTable()
    lux.config.set_SQL_connection(connection)
    sql_df.set_SQL_table("car")

    intent = [
        lux.Clause(attribute="Horsepower", aggregation="mean"),
        lux.Clause(attribute="Origin"),
    ]
    vis = Vis(intent)
    # Check data field in vis is empty before calling executor
    assert vis.data is None
    SQLExecutor.execute([vis], sql_df)
    assert type(vis.data) == lux.core.frame.LuxDataFrame
Ejemplo n.º 6
0
    def __init__(self, *args, **kw):
        self._history = History()
        self._intent = []
        self._inferred_intent = []
        self._recommendation = {}
        self._saved_export = None
        self._current_vis = []
        self._prev = None
        self._widget = None
        super(LuxDataFrame, self).__init__(*args, **kw)

        self.table_name = ""
        if lux.config.SQLconnection == "":
            from lux.executor.PandasExecutor import PandasExecutor

            lux.config.executor = PandasExecutor()
        else:
            from lux.executor.SQLExecutor import SQLExecutor

            lux.config.executor = SQLExecutor()

        self._sampled = None
        self._toggle_pandas_display = True
        self._message = Message()
        self._pandas_only = False
        # Metadata
        self._data_type = {}
        self.unique_values = None
        self.cardinality = None
        self._min_max = None
        self.pre_aggregated = None
        self._type_override = {}
        warnings.formatwarning = lux.warning_format
Ejemplo n.º 7
0
    def __init__(self, *args, table_name="", **kw):
        super(LuxSQLTable, self).__init__(*args, **kw)
        from lux.executor.SQLExecutor import SQLExecutor

        lux.config.executor = SQLExecutor()

        self._length = 0
        self._setup_done = False
        if table_name != "":
            self.set_SQL_table(table_name)
        warnings.formatwarning = lux.warning_format
Ejemplo n.º 8
0
Archivo: config.py Proyecto: siwefe/lux
    def set_executor_type(self, exe):
        if exe == "SQL":
            from lux.executor.SQLExecutor import SQLExecutor

            self.executor = SQLExecutor()
        elif exe == "Pandas":
            from lux.executor.PandasExecutor import PandasExecutor

            self.SQLconnection = ""
            self.executor = PandasExecutor()
        else:
            raise ValueError("Executor type must be either 'Pandas' or 'SQL'")
Ejemplo n.º 9
0
    def maintain_metadata(self):
        if lux.config.SQLconnection != "" and lux.config.executor.name != "SQL":
            from lux.executor.SQLExecutor import SQLExecutor

            lux.config.executor = SQLExecutor()

        # Check that metadata has not yet been computed
        if not hasattr(self, "_metadata_fresh") or not self._metadata_fresh:
            # only compute metadata information if the dataframe is non-empty
            if len(self) > 0 or lux.config.executor.name == "SQLExecutor":
                lux.config.executor.compute_stats(self)
                lux.config.executor.compute_dataset_metadata(self)
                self._infer_structure()
                self._metadata_fresh = True
Ejemplo n.º 10
0
def test_inequalityfilter():
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")

    vis = Vis(
        [
            lux.Clause(attribute="horsepower", filter_op=">", value=50),
            lux.Clause(attribute="milespergal"),
        ]
    )
    vis._vis_data = tbl
    filter_output = SQLExecutor.execute_filter(vis)
    assert filter_output[0] == 'WHERE "horsepower" > \'50\' AND "milespergal" IS NOT NULL'
    assert filter_output[1] == ["horsepower"]

    intent = [
        lux.Clause(attribute="horsepower", filter_op="<=", value=100),
        lux.Clause(attribute="milespergal"),
    ]
    vis = Vis(intent, tbl)
    vis._vis_data = tbl
    filter_output = SQLExecutor.execute_filter(vis)
    assert filter_output[0] == 'WHERE "horsepower" <= \'100\' AND "milespergal" IS NOT NULL'
    assert filter_output[1] == ["horsepower"]
Ejemplo n.º 11
0
    def read_query_template(self, query_template):
        from lux.executor.SQLExecutor import SQLExecutor

        query_dict = {}
        if type(query_template) is str:
            for line in query_template.split("\n"):
                (key, val) = line.split(":")
                query_dict[key] = val.strip()
        else:
            with open(query_file) as f:
                for line in f:
                    (key, val) = line.split(":")
                    query_dict[key] = val.strip()
        self.query_templates = query_dict
        self.executor = SQLExecutor()
Ejemplo n.º 12
0
    def maintain_metadata(self) -> None:
        """
        Maintain dataset metadata and statistics (Compute only if needed)
        """
        is_sql_tbl = lux.config.executor.name == "SQLExecutor"
        if lux.config.SQLconnection != "" and is_sql_tbl:
            from lux.executor.SQLExecutor import SQLExecutor

            lux.config.executor = SQLExecutor()
        if lux.config.lazy_maintain:
            # Check that metadata has not yet been computed
            if not hasattr(self,
                           "_metadata_fresh") or not self._metadata_fresh:
                # only compute metadata information if the dataframe is non-empty
                self.compute_metadata()
        else:
            self.compute_metadata()
Ejemplo n.º 13
0
    def set_executor_type(self, exe):
        if exe == "SQL":
            import pkgutil

            if pkgutil.find_loader("psycopg2") is None:
                raise ImportError(
                    "psycopg2 is not installed. Run `pip install psycopg2' to install psycopg2 to enable the Postgres connection."
                )
            else:
                import psycopg2
            from lux.executor.SQLExecutor import SQLExecutor

            self.executor = SQLExecutor()
        else:
            from lux.executor.PandasExecutor import PandasExecutor

            self.executor = PandasExecutor()
Ejemplo n.º 14
0
    def __init__(self, *args, joins=[], **kw):
        super(JoinedSQLTable, self).__init__(*args, **kw)
        from lux.executor.SQLExecutor import SQLExecutor

        lux.config.executor = SQLExecutor()
        # self._metadata.joins = []
        tables = self.extract_tables(joins)
        if len(tables) > 4:
            warnings.warn(
                f"\nPlease provide a maximum of 4 (Four) unique tables to ensure optimal performance.",
                stacklevel=2,
            )
        view_name = self.create_view(tables, joins)
        self._length = 0
        if view_name != "":
            self.set_SQL_table(view_name)
            # self._metadata.using_view = True
        warnings.formatwarning = lux.warning_format
Ejemplo n.º 15
0
def test_filter(global_var):
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")

    intent = [
        lux.Clause(attribute="horsepower"),
        lux.Clause(attribute="year"),
        lux.Clause(attribute="origin", filter_op="=", value="USA"),
    ]
    vis = Vis(intent, tbl)
    vis._vis_data = tbl
    filter_output = SQLExecutor.execute_filter(vis)
    where_clause = filter_output[0]
    where_clause_list = where_clause.split(" AND ")
    assert ("WHERE \"origin\" = 'USA'" in where_clause_list
            and '"horsepower" IS NOT NULL' in where_clause_list
            and '"year" IS NOT NULL' in where_clause_list)
    assert filter_output[1] == ["origin"]
Ejemplo n.º 16
0
def test_filter():
    connection = psycopg2.connect(
        "host=localhost dbname=postgres user=postgres password=lux")
    sql_df = lux.LuxSQLTable()
    lux.config.set_SQL_connection(connection)
    sql_df.set_SQL_table("car")

    intent = [
        lux.Clause(attribute="Horsepower"),
        lux.Clause(attribute="Year"),
        lux.Clause(attribute="Origin", filter_op="=", value="USA"),
    ]
    vis = Vis(intent, sql_df)
    vis._vis_data = sql_df
    filter_output = SQLExecutor.execute_filter(vis)
    assert (
        filter_output[0] ==
        'WHERE "Origin" = \'USA\' AND "Year" IS NOT NULL AND "Horsepower" IS NOT NULL'
    )
    assert filter_output[1] == ["Origin"]
Ejemplo n.º 17
0
def check_if_id_like(df, attribute):
    import re

    # Strong signals
    # so that aggregated reset_index fields don't get misclassified
    high_cardinality = df.cardinality[attribute] > 500
    attribute_contain_id = re.search(r"id|ID|iD|Id",
                                     str(attribute)) is not None
    almost_all_vals_unique = df.cardinality[attribute] >= 0.98 * len(df)
    is_string = pd.api.types.is_string_dtype(df[attribute])
    if is_string:
        # For string IDs, usually serial numbers or codes with alphanumerics have a consistent length (eg., CG-39405) with little deviation. For a high cardinality string field but not ID field (like Name or Brand), there is less uniformity across the string lengths.
        if len(df) > 50:
            if lux.config.executor.name == "PandasExecutor":
                sampled = df[attribute].sample(50, random_state=99)
            else:
                from lux.executor.SQLExecutor import SQLExecutor

                sampled = SQLExecutor.execute_preview(df, preview_size=50)
        else:
            sampled = df[attribute]
        str_length_uniformity = sampled.apply(
            lambda x: type(x) == str and len(x)).std() < 3
        return (high_cardinality
                and (attribute_contain_id or almost_all_vals_unique)
                and str_length_uniformity)
    else:
        if len(df) >= 2:
            series = df[attribute]
            diff = series.diff()
            evenly_spaced = all(diff.iloc[1:] == diff.iloc[1])
        else:
            evenly_spaced = True
        if attribute_contain_id:
            almost_all_vals_unique = df.cardinality[attribute] >= 0.75 * len(
                df)
        return high_cardinality and (almost_all_vals_unique or evenly_spaced)
Ejemplo n.º 18
0
def deviation_from_overall(
    vis: Vis,
    ldf: LuxDataFrame,
    filter_specs: list,
    msr_attribute: str,
    exclude_nan: bool = True,
) -> int:
    """
    Difference in bar chart/histogram shape from overall chart
    Note: this function assumes that the filtered vis.data is operating on the same range as the unfiltered vis.data.

    Parameters
    ----------
    vis : Vis
    ldf : LuxDataFrame
    filter_specs : list
            List of filters from the Vis
    msr_attribute : str
            The attribute name of the measure value of the chart
    exclude_nan: bool
            Whether to include/exclude NaN values as part of the deviation calculation

    Returns
    -------
    int
            Score describing how different the vis is from the overall vis
    """
    if lux.config.executor.name == "PandasExecutor":
        if exclude_nan:
            vdata = vis.data.dropna()
        else:
            vdata = vis.data
        v_filter_size = get_filtered_size(filter_specs, ldf)
        v_size = len(vis.data)
    elif lux.config.executor.name == "SQLExecutor":
        from lux.executor.SQLExecutor import SQLExecutor

        v_filter_size = SQLExecutor.get_filtered_size(filter_specs, ldf)
        v_size = len(ldf)
        vdata = vis.data
    v_filter = vdata[msr_attribute]
    total = v_filter.sum()
    v_filter = v_filter / total  # normalize by total to get ratio
    if total == 0:
        return 0
    # Generate an "Overall" Vis (TODO: This is computed multiple times for every vis, alternative is to directly access df.current_vis but we do not have guaruntee that will always be unfiltered vis (in the non-Filter action scenario))
    import copy

    unfiltered_vis = copy.copy(vis)
    # Remove filters, keep only attribute intent
    unfiltered_vis._inferred_intent = utils.get_attrs_specs(
        vis._inferred_intent)
    lux.config.executor.execute([unfiltered_vis], ldf)
    if exclude_nan:
        uv = unfiltered_vis.data.dropna()
    else:
        uv = unfiltered_vis.data
    v = uv[msr_attribute]
    v = v / v.sum()
    assert len(v) == len(
        v_filter), "Data for filtered and unfiltered vis have unequal length."
    sig = v_filter_size / v_size  # significance factor
    # Euclidean distance as L2 function

    rankSig = 1  # category measure value ranking significance factor
    # if the vis is a barchart, count how many categories' rank, based on measure value, changes after the filter is applied
    if vis.mark == "bar":
        dimList = vis.get_attr_by_data_model("dimension")

        # use Pandas rank function to calculate rank positions for each category
        v_rank = uv.rank()
        v_filter_rank = vdata.rank()
        # go through and count the number of ranking changes between the filtered and unfiltered data
        numCategories = ldf.cardinality[dimList[0].attribute]
        for r in range(0, numCategories - 1):
            if v_rank[msr_attribute][r] != v_filter_rank[msr_attribute][r]:
                rankSig += 1
        # normalize ranking significance factor
        rankSig = rankSig / numCategories

    from scipy.spatial.distance import euclidean

    return sig * rankSig * euclidean(v, v_filter)