def test_filter_aggregation_fillzero_aligned(global_var):
    """Check that the filtered SQL aggregation reports 0 for the 5- and 8-cylinder groups."""
    sql_table = lux.LuxSQLTable()
    sql_table.set_SQL_table("cars")
    clauses = [
        lux.Clause(attribute="cylinders"),
        lux.Clause(attribute="milespergal"),
        lux.Clause("origin=Japan"),
    ]
    aggregated = Vis(clauses, sql_table).data
    # Both cylinder groups must still be present in the result, with a value of 0.
    for cylinders in (5, 8):
        matching_rows = aggregated[aggregated["cylinders"] == cylinders]
        assert matching_rows["milespergal"].values[0] == 0
def test_vis_collection_via_list_of_vis(global_var):
    """Build a VisList from five hand-made Vis objects and check its length."""
    from lux.vis.VisList import VisList
    from lux.vis.Vis import Vis

    olympic = pytest.olympic
    # Convert the "Year" column to a pandas datetime dtype.
    olympic["Year"] = pd.to_datetime(olympic["Year"], format="%Y")

    paired_attributes = ["Sport", "Year", "Height", "HostRegion", "SportType"]
    visualizations = [
        Vis([lux.Clause("Weight"), lux.Clause(attribute)])
        for attribute in paired_attributes
    ]
    vlist = VisList(visualizations, olympic)
    assert len(vlist) == 5
def test_lazy_execution(global_var):
    """Verify that a Vis created without a source holds no data until the SQL executor runs."""
    tbl = lux.LuxSQLTable()
    tbl.set_SQL_table("cars")
    intent = [
        lux.Clause(attribute="horsepower", aggregation="mean"),
        lux.Clause(attribute="origin"),
    ]
    vis = Vis(intent)
    # The data field must be empty before the executor is called.
    assert vis.data is None
    SQLExecutor.execute([vis], tbl)
    # isinstance (rather than an exact type comparison) matches the check
    # used by the other Vis data tests.
    assert isinstance(vis.data, lux.core.frame.LuxDataFrame)
def test_apply_nan_filter():
    """Display a Vis filtered on ``some_nan2=nan`` and expect a bar mark."""
    from lux.vis.Vis import Vis
    import numpy as np

    nan = np.nan
    # Same ten records as before, laid out column-wise.
    columns = {
        "fully_nan": [nan] * 10,
        "some_nan": [3.0, 15.0, nan, 7.0, 2.0, 3.0, 1.0, 1.0, 2.0, 11.0],
        "some_nan2": [nan, 3.0, 3.0, 0.0, 2.0, nan, 1.0, 1.0, 0.0, 0.0],
    }
    frame = pd.DataFrame(columns)
    vis = Vis(["some_nan", "some_nan2=nan"], frame)
    vis._ipython_display_()
    assert vis.mark == "bar"
def execute_scatter(view: Vis, tbl: LuxSQLTable):
    """
    Given a scatterplot vis and a LuxSQLTable, fetch the data required to render the vis.

    1) Generate the WHERE clause for the SQL query
    2) Count the rows that satisfy the WHERE clause
    3) If the row count exceeds ``lux.config.sampling_cap``, query a random sample of 10000 rows
    4) Otherwise query every matching row
    5) Store the result on the vis (``view._vis_data``) after converting it with ``utils.pandas_to_lux``

    Parameters
    ----------
    view: lux.Vis
        Scatterplot vis whose inferred intent determines the columns and filters to query.
    tbl : lux.LuxSQLTable
        LuxSQLTable with specified intent; its ``table_name`` is the table that is queried.

    Returns
    -------
    None
    """
    # Every attribute mentioned in the intent, except the synthetic "Record" attribute.
    attributes = {
        clause.attribute
        for clause in view._inferred_intent
        if clause.attribute and clause.attribute != "Record"
    }
    where_clause, filterVars = SQLExecutor.execute_filter(view)

    # Double-quote each column name before joining them into the SELECT list.
    required_variables = ",".join(
        '"' + var_name + '"' for var_name in attributes | set(filterVars)
    )

    # A single COUNT query is enough; the previous implementation issued a
    # second, identical count whose result was never read.
    row_count = list(
        pandas.read_sql(
            f"SELECT COUNT(*) FROM {tbl.table_name} {where_clause}",
            lux.config.SQLconnection,
        )["count"]
    )[0]
    if row_count > lux.config.sampling_cap:
        # NOTE(review): the sample size is a fixed 10000 rows, independent of
        # lux.config.sampling_cap -- confirm that this is intended.
        query = f"SELECT {required_variables} FROM {tbl.table_name} {where_clause} ORDER BY random() LIMIT 10000"
    else:
        query = "SELECT {} FROM {} {}".format(required_variables, tbl.table_name, where_clause)
    data = pandas.read_sql(query, lux.config.SQLconnection)
    view._vis_data = utils.pandas_to_lux(data)
def test_autoencoding_line_chart(global_var):
    """Check channel assignment for Year/Acceleration with no, partial and full channel hints."""
    cars = pytest.car_df
    # Convert the "Year" column to a pandas datetime dtype.
    cars["Year"] = pd.to_datetime(cars["Year"], format="%Y")

    # (extra kwargs for the Year clause, extra kwargs for the Acceleration clause,
    #  expected Year channel, expected Acceleration channel)
    scenarios = [
        ({}, {}, "x", "y"),  # no channel specified
        ({"channel": "y"}, {}, "y", "x"),  # partial channel specified
        ({"channel": "y"}, {"channel": "x"}, "y", "x"),  # full channel specified
    ]
    for year_kwargs, accel_kwargs, year_channel, accel_channel in scenarios:
        vis = Vis(
            [
                lux.Clause(attribute="Year", **year_kwargs),
                lux.Clause(attribute="Acceleration", **accel_kwargs),
            ],
            cars,
        )
        check_attribute_on_channel(vis, "Year", year_channel)
        check_attribute_on_channel(vis, "Acceleration", accel_channel)

    # Two columns requesting the same channel must be rejected.
    conflicting_intent = [
        lux.Clause(attribute="Year", channel="x"),
        lux.Clause(attribute="Acceleration", channel="x"),
    ]
    with pytest.raises(ValueError):
        cars.set_intent(conflicting_intent)
def test_refresh_inplace():
    """Refresh a Vis after its source's date column is converted to datetime and expect a line chart."""
    from lux.vis.Vis import Vis

    records = {
        "date": ["2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01"],
        "value": [10.5, 15.2, 20.3, 25.2],
    }
    frame = pd.DataFrame(records)
    temporal_warning = "Lux detects that the attribute 'date' may be temporal."
    with pytest.warns(UserWarning, match=temporal_warning):
        frame._repr_html_()
    assert frame.data_type_lookup["date"] == "temporal"

    vis = Vis(["date", "value"], frame)

    # Convert the column in place, then recompute the frame's metadata.
    frame["date"] = pd.to_datetime(frame["date"], format="%Y-%m-%d")
    frame.maintain_metadata()
    assert frame.data_type["temporal"][0] == "date"

    vis.refresh_source(frame)
    assert vis.mark == "line"
    for channel, expected_attribute in (("x", "date"), ("y", "value")):
        assert vis.get_attr_by_channel(channel)[0].attribute == expected_attribute
def test_autoencoding_color_line_chart():
    """Expect Year on x, Acceleration on y and Origin on color for a three-attribute intent."""
    cars = pd.read_csv("lux/data/car.csv")
    # Convert the "Year" column to a pandas datetime dtype.
    cars["Year"] = pd.to_datetime(cars["Year"], format="%Y")

    expected_channels = (("Year", "x"), ("Acceleration", "y"), ("Origin", "color"))
    clauses = [lux.Clause(attribute=name) for name, _ in expected_channels]
    vis = Vis(clauses, cars)
    for name, channel in expected_channels:
        check_attribute_on_channel(vis, name, channel)
def test_vis_list_custom_title_override():
    """Check that a title given to each Vis is still present after building a VisList."""
    olympic = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/olympic.csv?raw=true")
    olympic["Year"] = pd.to_datetime(olympic["Year"], format="%Y")

    paired_attributes = ["Sport", "Year", "Height", "HostRegion", "SportType"]
    titled_visualizations = [
        Vis(
            [lux.Clause("Weight"), lux.Clause(attribute)],
            title="overriding dummy title",
        )
        for attribute in paired_attributes
    ]
    vlist = VisList(titled_visualizations, olympic)
    for vis in vlist:
        assert vis.title == "overriding dummy title"
def test_vis_list_custom_title_override(global_var):
    """Check that a title given to each Vis is still present after building a VisList."""
    olympic = pytest.olympic
    olympic["Year"] = pd.to_datetime(olympic["Year"], format="%Y")

    paired_attributes = ["Sport", "Year", "Height", "HostRegion", "SportType"]
    titled_visualizations = [
        Vis(
            [lux.Clause("Weight"), lux.Clause(attribute)],
            title="overriding dummy title",
        )
        for attribute in paired_attributes
    ]
    vlist = VisList(titled_visualizations, olympic)
    for vis in vlist:
        assert vis.title == "overriding dummy title"
def execute_filter(vis: Vis):
    """
    Apply every filter clause in the vis's inferred intent to its data.

    Each filter is applied in turn to ``vis.data`` through
    ``PandasExecutor.apply_filter`` and the result is stored in ``vis._vis_data``.

    Parameters
    ----------
    vis : lux.Vis
        Vis whose ``data`` is already populated.

    Returns
    -------
    bool
        True when the intent contains at least one filter, False otherwise.
    """
    assert (
        vis.data is not None
    ), "execute_filter assumes input vis.data is populated (if not, populate with LuxDataFrame values)"
    filter_specs = utils.get_filter_specs(vis._inferred_intent)
    if not filter_specs:
        return False
    # TODO: Need to handle OR logic
    for spec in filter_specs:
        vis._vis_data = PandasExecutor.apply_filter(
            vis.data, spec.attribute, spec.filter_op, spec.value
        )
    return True
def test_autoencoding_color_scatter_chart(global_var):
    """Check which attribute lands on the color channel, with and without an explicit hint."""
    cars = pytest.car_df
    # Convert the "Year" column to a pandas datetime dtype.
    cars["Year"] = pd.to_datetime(cars["Year"], format="%Y")

    # Without any channel hint, Origin is expected on the color channel.
    inferred = Vis(
        [
            lux.Clause(attribute="Horsepower"),
            lux.Clause(attribute="Acceleration"),
            lux.Clause(attribute="Origin"),
        ],
        cars,
    )
    check_attribute_on_channel(inferred, "Origin", "color")

    # An explicit color request on Acceleration is expected to be honored.
    hinted = Vis(
        [
            lux.Clause(attribute="Horsepower"),
            lux.Clause(attribute="Acceleration", channel="color"),
            lux.Clause(attribute="Origin"),
        ],
        cars,
    )
    check_attribute_on_channel(hinted, "Acceleration", "color")
def test_autoencoding_color_line_chart(global_var):
    """Expect Year on x, Acceleration on y and Origin on color with the Pandas executor."""
    lux.config.set_executor_type("Pandas")
    cars = pytest.car_df
    # Convert the "Year" column to a pandas datetime dtype.
    cars["Year"] = pd.to_datetime(cars["Year"], format="%Y")

    expected_channels = (("Year", "x"), ("Acceleration", "y"), ("Origin", "color"))
    clauses = [lux.Clause(attribute=name) for name, _ in expected_channels]
    vis = Vis(clauses, cars)
    for name, channel in expected_channels:
        check_attribute_on_channel(vis, name, channel)
def test_vis_collection_via_list_of_vis():
    """Build a VisList from five hand-made Vis objects and check its length."""
    from lux.vis.VisList import VisList
    from lux.vis.Vis import Vis

    olympic = pd.read_csv(
        "https://github.com/lux-org/lux-datasets/blob/master/data/olympic.csv?raw=true"
    )
    # Convert the "Year" column to a pandas datetime dtype.
    olympic["Year"] = pd.to_datetime(olympic["Year"], format="%Y")

    paired_attributes = ["Sport", "Year", "Height", "HostRegion", "SportType"]
    visualizations = [
        Vis([lux.Clause("Weight"), lux.Clause(attribute)])
        for attribute in paired_attributes
    ]
    vlist = VisList(visualizations, olympic)
    assert len(vlist) == 5
def test_multi_vis():
    """Rendering a Vis whose intent expands to several visualizations must raise SyntaxError."""
    college = pd.read_csv("lux/data/college.csv")
    expected_message = "The intent that you specified corresponds to more than one visualization."
    ambiguous_intents = [
        ["SATAverage", "AverageCost", "Geography=?"],
        ["SATAverage", "?"],
        ["SATAverage", "AverageCost", "Region=New England|Southeast"],
    ]
    for intent in ambiguous_intents:
        with pytest.raises(SyntaxError, match=expected_message):
            Vis(intent, college)._repr_html_()
def test_autoencoding_line_chart(global_var):
    """Check channel assignment for year/acceleration on a SQL table with no, partial and full hints."""
    # test for sql executor
    cars_table = lux.LuxSQLTable(table_name="cars")

    # (extra kwargs for the year clause, extra kwargs for the acceleration clause,
    #  expected year channel, expected acceleration channel)
    scenarios = [
        ({}, {}, "x", "y"),  # no channel specified
        ({"channel": "y"}, {}, "y", "x"),  # partial channel specified
        ({"channel": "y"}, {"channel": "x"}, "y", "x"),  # full channel specified
    ]
    for year_kwargs, accel_kwargs, year_channel, accel_channel in scenarios:
        vis = Vis(
            [
                lux.Clause(attribute="year", **year_kwargs),
                lux.Clause(attribute="acceleration", **accel_kwargs),
            ],
            cars_table,
        )
        check_attribute_on_channel(vis, "year", year_channel)
        check_attribute_on_channel(vis, "acceleration", accel_channel)

    # Two columns requesting the same channel must be rejected.
    conflicting_intent = [
        lux.Clause(attribute="year", channel="x"),
        lux.Clause(attribute="acceleration", channel="x"),
    ]
    with pytest.raises(ValueError):
        cars_table.set_intent(conflicting_intent)
def test_sort_bar():
    """Check the inferred sort order of the dimension clause for two bar charts."""
    from lux.processor.Compiler import Compiler
    from lux.vis.Vis import Vis

    # (dimension attribute, expected sort value on the dimension clause)
    expectations = (("Origin", ""), ("Name", "ascending"))
    for dimension, expected_sort in expectations:
        # The dataset is re-read for every case, as in the original test.
        cars = pd.read_csv("lux/data/car.csv")
        measure_clause = lux.Clause(
            attribute="Acceleration", data_model="measure", data_type="quantitative"
        )
        dimension_clause = lux.Clause(
            attribute=dimension, data_model="dimension", data_type="nominal"
        )
        vis = Vis([measure_clause, dimension_clause], cars)
        assert vis.mark == "bar"
        assert vis._inferred_intent[1].sort == expected_sort
def test_autoencoding_color_scatter_chart():
    """Check which attribute lands on the color channel, with and without an explicit hint."""
    cars = pd.read_csv("lux/data/car.csv")
    # Convert the "Year" column to a pandas datetime dtype.
    cars["Year"] = pd.to_datetime(cars["Year"], format="%Y")

    # Without any channel hint, Origin is expected on the color channel.
    inferred = Vis(
        [
            lux.Clause(attribute="Horsepower"),
            lux.Clause(attribute="Acceleration"),
            lux.Clause(attribute="Origin"),
        ],
        cars,
    )
    check_attribute_on_channel(inferred, "Origin", "color")

    # An explicit color request on Acceleration is expected to be honored.
    hinted = Vis(
        [
            lux.Clause(attribute="Horsepower"),
            lux.Clause(attribute="Acceleration", channel="color"),
            lux.Clause(attribute="Origin"),
        ],
        cars,
    )
    check_attribute_on_channel(hinted, "Acceleration", "color")
def test_lazy_execution():
    """Verify that a Vis created without a source holds no data until the SQL executor runs."""
    # NOTE(review): the connection is handed to lux.config and never closed
    # here -- confirm whether the test session is expected to clean it up.
    connection = psycopg2.connect("host=localhost dbname=postgres user=postgres password=lux")
    sql_df = lux.LuxSQLTable()
    lux.config.set_SQL_connection(connection)
    sql_df.set_SQL_table("car")
    intent = [
        lux.Clause(attribute="Horsepower", aggregation="mean"),
        lux.Clause(attribute="Origin"),
    ]
    vis = Vis(intent)
    # The data field must be empty before the executor is called.
    assert vis.data is None
    SQLExecutor.execute([vis], sql_df)
    # isinstance (rather than an exact type comparison) matches the check
    # used by the other Vis data tests.
    assert isinstance(vis.data, lux.core.frame.LuxDataFrame)
def test_filter_aggregation_fillzero_aligned():
    """Check that the filtered SQL aggregation reports 0 for the 5- and 8-cylinder groups."""
    db_connection = psycopg2.connect("host=localhost dbname=postgres user=postgres password=lux")
    car_table = lux.LuxSQLTable()
    lux.config.set_SQL_connection(db_connection)
    car_table.set_SQL_table("car")
    clauses = [
        lux.Clause(attribute="Cylinders"),
        lux.Clause(attribute="MilesPerGal"),
        lux.Clause("Origin=Japan"),
    ]
    aggregated = Vis(clauses, car_table).data
    # Both cylinder groups must still be present in the result, with a value of 0.
    for cylinders in (5, 8):
        matching_rows = aggregated[aggregated["Cylinders"] == cylinders]
        assert matching_rows["MilesPerGal"].values[0] == 0
def test_colored_bar_chart():
    """Check the shape of the data computed for a bar chart with a color channel."""
    from lux.vis.Vis import Vis
    from lux.vis.Vis import Clause

    cars = pd.read_csv("lux/data/car.csv")
    encoded_clauses = [
        Clause(attribute="MilesPerGal", channel="x"),
        Clause(attribute="Origin", channel="y"),
        Clause(attribute="Cylinders", channel="color"),
    ]
    colored_vis = Vis(encoded_clauses, cars)

    # Make sure the dimension of the data is correct.
    n_colors = len(cars.unique_values["Cylinders"])
    n_groups = len(cars.unique_values["Origin"])
    assert len(colored_vis.data.columns) == 3

    # Same three conditions as the original chained comparison
    # ``len(data) == 15 > n_groups < n_colors * n_groups``, checked one by one.
    # Original note: not color_cardinality*group_by_cardinality since some
    # combinations have 0 values.
    row_count = len(colored_vis.data)
    assert row_count == 15
    assert 15 > n_groups
    assert n_groups < n_colors * n_groups
def test_vis_private_properties():
    """Check that ``data``, ``code``, ``min_max`` and ``mark`` are readable but cannot be assigned."""
    from lux.vis.Vis import Vis

    df = pd.read_csv("lux/data/car.csv")
    vis = Vis(["Horsepower", "Weight"], df)
    vis._repr_html_()

    # The AttributeError text for a setter-less property differs by interpreter
    # version: CPython <= 3.10 says "can't set attribute", while 3.11+ says
    # "property '...' of '...' object has no setter". Accept either wording.
    read_only_error = "can't set attribute|has no setter"

    assert isinstance(vis.data, lux.core.frame.LuxDataFrame)
    with pytest.raises(AttributeError, match=read_only_error):
        vis.data = "some val"

    assert isinstance(vis.code, dict)
    with pytest.raises(AttributeError, match=read_only_error):
        vis.code = "some val"

    assert isinstance(vis.min_max, dict)
    with pytest.raises(AttributeError, match=read_only_error):
        vis.min_max = "some val"

    assert vis.mark == "scatter"
    with pytest.raises(AttributeError, match=read_only_error):
        vis.mark = "some val"
def test_filter_aggregation_fillzero_aligned(global_var):
    """Compare the filtered aggregation against a pandas group-by, including zero-filled groups."""
    df = pytest.car_df
    intent = [
        lux.Clause(attribute="Cylinders"),
        lux.Clause(attribute="MilesPerGal"),
        lux.Clause("Origin=Japan"),
    ]
    vis = Vis(intent, df)
    result = vis.data
    # Select the column before aggregating: averaging the whole frame also
    # tries to average the non-numeric columns, which raises TypeError on
    # pandas >= 2.0. The per-group MilesPerGal means are unchanged.
    externalValidation = df[df["Origin"] == "Japan"].groupby("Cylinders")["MilesPerGal"].mean()

    # These two cylinder groups must be present with a value of 0.
    assert result[result["Cylinders"] == 5]["MilesPerGal"].values[0] == 0
    assert result[result["Cylinders"] == 8]["MilesPerGal"].values[0] == 0
    # The remaining groups must match the independently computed means.
    assert result[result["Cylinders"] == 3]["MilesPerGal"].values[0] == externalValidation[3]
    assert result[result["Cylinders"] == 4]["MilesPerGal"].values[0] == externalValidation[4]
    assert result[result["Cylinders"] == 6]["MilesPerGal"].values[0] == externalValidation[6]
def row_group(ldf):
    """
    Build the "Row Groups" recommendation: one Vis per row of ``ldf``.

    Rows are only visualized when ``ldf.index`` has a single level; otherwise
    the returned collection is empty.

    Parameters
    ----------
    ldf : lux.core.frame
        Dataframe whose rows are visualized.

    Returns
    -------
    dict
        Recommendation with the keys "action", "description",
        "long_description" and "collection" (a VisList).
    """
    recommendation = {
        "action": "Row Groups",
        "description": "Shows charts of possible visualizations with respect to the row-wise index.",
        "long_description": 'A row index can be thought of as an extra row that indicates the values that the user is interested in. \
        Lux focuses on visualizing named dataframe indices, i.e., indices with a non-null name property, as a proxy of the attribute \
        that the user is interested in or have operated on (e.g., group-by attribute). In particular, dataframes with named indices \
        are often pre-aggregated, so Lux visualizes exactly the values that the dataframe portrays. \
        <a href="https://lux-api.readthedocs.io/en/latest/source/advanced/indexgroup.html" target="_blank">More details</a>',
    }
    charts = []
    if ldf.index.nlevels == 1:
        # Fall back to "index" when the columns axis has no name.
        dim_name = ldf.columns.name if ldf.columns.name is not None else "index"
        for position in range(len(ldf)):
            row = ldf.iloc[position, ]
            row_frame = row.reset_index()
            # TODO: auto-detect the data type/model of the dimension column
            # (e.g. temporal when the columns are a DatetimeIndex).
            measure_clause = lux.Clause(row.name, data_model="measure", aggregation=None)
            charts.append(Vis([dim_name, measure_clause], row_frame))
    # Note that we are not computing interestingness score here because we want
    # to preserve the arrangement of the aggregated data
    recommendation["collection"] = VisList(charts)
    return recommendation
def test_colored_bar_chart():
    """Check the shape of the data computed for a SQL-backed bar chart with a color channel."""
    from lux.vis.Vis import Vis
    from lux.vis.Vis import Clause

    cars_table = lux.LuxSQLTable()
    cars_table.set_SQL_table("cars")
    encoded_clauses = [
        Clause(attribute="milespergal", channel="x"),
        Clause(attribute="origin", channel="y"),
        Clause(attribute="cylinders", channel="color"),
    ]
    colored_vis = Vis(encoded_clauses, cars_table)

    # Make sure the dimension of the data is correct.
    n_colors = len(cars_table.unique_values["cylinders"])
    n_groups = len(cars_table.unique_values["origin"])
    assert len(colored_vis.data.columns) == 3

    # Same three conditions as the original chained comparison
    # ``len(data) == 15 > n_groups < n_colors * n_groups``, checked one by one.
    # Original note: not color cardinality * group-by cardinality since some
    # combinations have 0 values.
    row_count = len(colored_vis.data)
    assert row_count == 15
    assert 15 > n_groups
    assert n_groups < n_colors * n_groups
def test_filter_aggregation_fillzero_aligned():
    """Compare the filtered aggregation against a pandas group-by, including zero-filled groups."""
    df = pd.read_csv("lux/data/car.csv")
    intent = [
        lux.Clause(attribute="Cylinders"),
        lux.Clause(attribute="MilesPerGal"),
        lux.Clause("Origin=Japan"),
    ]
    vis = Vis(intent, df)
    result = vis.data
    # Select the column before aggregating: averaging the whole frame also
    # tries to average the non-numeric columns, which raises TypeError on
    # pandas >= 2.0. The per-group MilesPerGal means are unchanged.
    externalValidation = df[df["Origin"] == "Japan"].groupby("Cylinders")["MilesPerGal"].mean()

    # These two cylinder groups must be present with a value of 0.
    assert result[result["Cylinders"] == 5]["MilesPerGal"].values[0] == 0
    assert result[result["Cylinders"] == 8]["MilesPerGal"].values[0] == 0
    # The remaining groups must match the independently computed means.
    assert result[result["Cylinders"] == 3]["MilesPerGal"].values[0] == externalValidation[3]
    assert result[result["Cylinders"] == 4]["MilesPerGal"].values[0] == externalValidation[4]
    assert result[result["Cylinders"] == 6]["MilesPerGal"].values[0] == externalValidation[6]
def test_vis_private_properties(global_var):
    """Check that ``data``, ``code``, ``min_max`` and ``mark`` are readable but cannot be assigned."""
    from lux.vis.Vis import Vis

    df = pytest.car_df
    vis = Vis(["Horsepower", "Weight"], df)
    vis._ipython_display_()

    # The AttributeError text for a setter-less property differs by interpreter
    # version: CPython <= 3.10 says "can't set attribute", while 3.11+ says
    # "property '...' of '...' object has no setter". Accept either wording.
    read_only_error = "can't set attribute|has no setter"

    assert isinstance(vis.data, lux.core.frame.LuxDataFrame)
    with pytest.raises(AttributeError, match=read_only_error):
        vis.data = "some val"

    assert isinstance(vis.code, dict)
    with pytest.raises(AttributeError, match=read_only_error):
        vis.code = "some val"

    assert isinstance(vis.min_max, dict)
    with pytest.raises(AttributeError, match=read_only_error):
        vis.min_max = "some val"

    assert vis.mark == "scatter"
    with pytest.raises(AttributeError, match=read_only_error):
        vis.mark = "some val"
def column_group(ldf):
    """
    Build the "Column Groups" recommendation: one Vis per non-object column of ``ldf``.

    Each Vis pairs the index column (as a nominal dimension) with one column
    (as a quantitative measure, without aggregation). Columns are only
    visualized when ``ldf.index`` has a single level.

    Note that when ``ldf.columns`` is a DatetimeIndex, the column labels of
    ``ldf`` itself are replaced by their formatted strings.

    Parameters
    ----------
    ldf : lux.core.frame
        Dataframe whose columns are visualized.

    Returns
    -------
    dict
        Recommendation with the keys "action", "description" and
        "collection" (a VisList built on the index-reset dataframe).
    """
    if isinstance(ldf.columns, pd.DatetimeIndex):
        ldf.columns = ldf.columns.format()
    # use a single shared flattened frame so that metadata doesn't need to be computed for every vis
    flattened = ldf.reset_index()

    charts = []
    if ldf.index.nlevels == 1:
        index_column_name = ldf.index.name or "index"
        # NOTE(review): this branch looks unreachable, because a DatetimeIndex
        # has already been replaced by formatted labels above -- confirm.
        if isinstance(ldf.columns, pd.DatetimeIndex):
            ldf.columns = ldf.columns.to_native_types()
        charts = [
            Vis(
                [
                    lux.Clause(
                        attribute=index_column_name,
                        data_type="nominal",
                        data_model="dimension",
                        aggregation=None,
                    ),
                    lux.Clause(
                        attribute=str(attribute),
                        data_type="quantitative",
                        data_model="measure",
                        aggregation=None,
                    ),
                ]
            )
            for attribute in ldf.columns
            if ldf[attribute].dtype != "object" and (attribute != "index")
        ]

    recommendation = {
        "action": "Column Groups",
        "description": "Shows charts of possible visualizations with respect to the column-wise index.",
    }
    # Note that we are not computing interestingness score here because we want
    # to preserve the arrangement of the aggregated ldf
    recommendation["collection"] = VisList(charts, flattened)
    return recommendation
def test_colored_bar_chart(global_var):
    """Check the shape of the data computed for a bar chart with a color channel."""
    from lux.vis.Vis import Vis
    from lux.vis.Vis import Clause

    cars = pytest.car_df
    encoded_clauses = [
        Clause(attribute="MilesPerGal", channel="x"),
        Clause(attribute="Origin", channel="y"),
        Clause(attribute="Cylinders", channel="color"),
    ]
    colored_vis = Vis(encoded_clauses, cars)

    # Make sure the dimension of the data is correct.
    n_colors = len(cars.unique_values["Cylinders"])
    n_groups = len(cars.unique_values["Origin"])
    assert len(colored_vis.data.columns) == 3

    # Same three conditions as the original chained comparison
    # ``len(data) == 15 > n_groups < n_colors * n_groups``, checked one by one.
    # Original note: not color_cardinality*group_by_cardinality since some
    # combinations have 0 values.
    row_count = len(colored_vis.data)
    assert row_count == 15
    assert 15 > n_groups
    assert n_groups < n_colors * n_groups
def column_group(ldf):
    """
    Build the "Column Groups" recommendation: one Vis per column of ``ldf``.

    Each Vis pairs the index column with one column (without aggregation) and
    is built on the index-reset dataframe. Columns are only visualized when
    ``ldf.index`` has a single level.

    Note that when ``ldf.columns`` is a DatetimeIndex, the column labels of
    ``ldf`` itself are replaced by their formatted strings.

    Parameters
    ----------
    ldf : lux.core.frame
        Dataframe whose columns are visualized.

    Returns
    -------
    dict
        Recommendation with the keys "action", "description" and
        "collection" (a VisList).
    """
    recommendation = {
        "action": "Column Groups",
        "description": "Shows charts of possible visualizations with respect to the column-wise index.",
    }
    collection = []
    ldf_flat = ldf
    if isinstance(ldf.columns, pd.DatetimeIndex):
        ldf_flat.columns = ldf_flat.columns.format()
    # use a single shared ldf_flat so that metadata doesn't need to be computed for every vis
    ldf_flat = ldf_flat.reset_index()
    if ldf.index.nlevels == 1:
        # reset_index() names the inserted column "index" by default when the
        # index is unnamed, so use that name instead of passing None to Vis.
        index_column_name = ldf.index.name if ldf.index.name is not None else "index"
        # NOTE(review): this branch looks unreachable, because a DatetimeIndex
        # has already been replaced by formatted labels above -- confirm.
        if isinstance(ldf.columns, pd.DatetimeIndex):
            ldf.columns = ldf.columns.to_native_types()
        for attribute in ldf.columns:
            vis = Vis([index_column_name, lux.Clause(str(attribute), aggregation=None)], ldf_flat)
            collection.append(vis)
    vlst = VisList(collection)
    # Note that we are not computing interestingness score here because we want
    # to preserve the arrangement of the aggregated ldf
    recommendation["collection"] = vlst
    return recommendation