def execute_sampling(ldf: LuxDataFrame):
    """
    Compute and cache a sample for the overall dataframe

    - When # of rows exceeds lux.config.sampling_start, take 75% df as sample
    - When # of rows exceeds lux.config.sampling_cap, cap the df at {lux.config.sampling_cap} rows

    lux.config.sampling_start = 100k rows
    lux.config.sampling_cap = 1M rows

    Parameters
    ----------
    ldf : LuxDataFrame
    """
    SAMPLE_FLAG = lux.config.sampling
    SAMPLE_START = lux.config.sampling_start
    SAMPLE_CAP = lux.config.sampling_cap
    SAMPLE_FRAC = 0.75

    if SAMPLE_FLAG and len(ldf) > SAMPLE_CAP:
        if ldf._sampled is None:  # memoize unfiltered sample df
            ldf._sampled = ldf.sample(n=SAMPLE_CAP, random_state=1)
        ldf._message.add_unique(
            f"Large dataframe detected: Lux is only visualizing a sample capped at {SAMPLE_CAP} rows.",
            priority=99,
        )
    elif SAMPLE_FLAG and len(ldf) > SAMPLE_START:
        if ldf._sampled is None:  # memoize unfiltered sample df
            ldf._sampled = ldf.sample(frac=SAMPLE_FRAC, random_state=1)
        ldf._message.add_unique(
            # BUG FIX: SAMPLE_FRAC is a fraction (0.75); convert it to a
            # percentage so the message reads "75%" instead of "0.75%".
            f"Large dataframe detected: Lux is visualizing a sample of {SAMPLE_FRAC * 100:.0f}% of the dataframe ({len(ldf._sampled)} rows).",
            priority=99,
        )
    else:
        # Small frame: use the full dataframe as its own "sample".
        ldf._sampled = ldf
def compute_dataset_metadata(self, ldf: LuxDataFrame):
    """Rebuild type and model metadata for ``ldf`` from scratch."""
    # Clear, then re-derive, the data-type maps.
    ldf.data_type_lookup, ldf.data_type = {}, {}
    self.compute_data_type(ldf)
    # Clear, then re-derive, the data-model maps (reads data_type filled above).
    ldf.data_model_lookup, ldf.data_model = {}, {}
    self.compute_data_model(ldf)
def compute_data_model(self, ldf: LuxDataFrame):
    """Derive measure/dimension groupings from the computed data types."""
    types = ldf.data_type
    # Quantitative attributes act as measures; nominal, temporal, and
    # id-like attributes act as dimensions.
    dimensions = types["nominal"] + types["temporal"] + types["id"]
    ldf.data_model = {"measure": types["quantitative"], "dimension": dimensions}
    # Invert the grouping so each attribute can be looked up individually.
    ldf.data_model_lookup = self.reverseMapping(ldf.data_model)
def compute_data_type(self, ldf: LuxDataFrame):
    """Infer a Lux data type for every column of ``ldf``.

    Populates ``ldf.data_type_lookup`` (column -> one of 'temporal',
    'quantitative', 'nominal', 'id') and derives ``ldf.data_type`` via
    ``self.mapping``. Warns about temporal-looking columns that are not
    actually Datetime dtype.
    """
    for attr in list(ldf.columns):
        temporal_var_list = ["month", "year", "day", "date", "time"]
        if (isinstance(attr, pd._libs.tslibs.timestamps.Timestamp)):
            # If timestamp, make the dictionary keys the _repr_ (e.g., TimeStamp('2020-04-05 00.000')--> '2020-04-05')
            ldf.data_type_lookup[attr] = "temporal"
        # elif any(var in str(attr).lower() for var in temporal_var_list):
        elif str(attr).lower() in temporal_var_list:
            # Column *name* looks temporal (e.g. "year"), regardless of dtype.
            ldf.data_type_lookup[attr] = "temporal"
        elif ldf.dtypes[attr] == "float64":
            ldf.data_type_lookup[attr] = "quantitative"
        elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
            # See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values
            if (ldf.pre_aggregated):
                # NOTE(review): this only sets "nominal" and may then be
                # overwritten by the checks below — confirm intended.
                if (ldf.cardinality[attr] == len(ldf)):
                    ldf.data_type_lookup[attr] = "nominal"
            if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 10:
                ldf.data_type_lookup[attr] = "nominal"
            elif check_if_id_like(ldf, attr):
                ldf.data_type_lookup[attr] = "id"
            else:
                ldf.data_type_lookup[attr] = "quantitative"
        # Eliminate this clause because a single NaN value can cause the dtype to be object
        elif ldf.dtypes[attr] == "object":
            ldf.data_type_lookup[attr] = "nominal"
        elif is_datetime_series(ldf.dtypes[attr]):  #check if attribute is any type of datetime dtype
            ldf.data_type_lookup[attr] = "temporal"
    # for attr in list(df.dtypes[df.dtypes=="int64"].keys()):
    # if self.cardinality[attr]>50:
    # A named, non-integer index is treated as an extra nominal attribute.
    if (ldf.index.dtype != 'int64' and ldf.index.name):
        ldf.data_type_lookup[ldf.index.name] = "nominal"
    ldf.data_type = self.mapping(ldf.data_type_lookup)

    from pandas.api.types import is_datetime64_any_dtype as is_datetime
    # Collect columns classified as temporal that are not Datetime dtype,
    # so the user can be told to convert them with pd.to_datetime.
    non_datetime_attrs = []
    for attr in ldf.columns:
        if ldf.data_type_lookup[attr] == 'temporal' and not is_datetime(ldf[attr]):
            non_datetime_attrs.append(attr)
    if len(non_datetime_attrs) == 1:
        warnings.warn(
            f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
            "In order to display visualizations for this attribute accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
            "Please consider converting this attribute using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
            "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
            "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
            stacklevel=2)
    elif len(non_datetime_attrs) > 1:
        warnings.warn(
            f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
            "In order to display visualizations for these attributes accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
            "Please consider converting these attributes using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
            "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
            "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
            stacklevel=2)
def f(*args, **kwargs):
    """Constructor hook: build a LuxDataFrame and carry over Lux metadata.

    Each attribute listed in ``self._metadata`` is copied onto the new
    frame, defaulting to None when absent on ``self``.
    """
    # Dead commented-out default-metadata handling removed; this matches the
    # cleaned-up variant of the same hook elsewhere in the codebase.
    df = LuxDataFrame(*args, **kwargs)
    for attr in self._metadata:
        df.__dict__[attr] = getattr(self, attr, None)
    return df
def compute_stats(self, ldf: LuxDataFrame):
    """Precompute unique values, cardinality, numeric min/max, and row count."""
    ldf.unique_values = {}
    ldf._min_max = {}
    ldf.cardinality = {}
    ldf._length = len(ldf)

    for col in ldf.columns:
        # Timestamp column labels are keyed by their date repr string
        # (e.g., TimeStamp('2020-04-05 00.000') --> '2020-04-05').
        if isinstance(col, pd._libs.tslibs.timestamps.Timestamp):
            key = str(col._date_repr)
        else:
            key = col
        uniques = list(ldf[col].unique())
        ldf.unique_values[key] = uniques
        ldf.cardinality[key] = len(uniques)
        dtype = ldf.dtypes[col]
        # Numeric columns additionally get a (min, max) value range.
        if pd.api.types.is_float_dtype(dtype) or pd.api.types.is_integer_dtype(dtype):
            ldf._min_max[key] = (ldf[col].min(), ldf[col].max())

    # A non-integer index is recorded as an extra attribute keyed by its name.
    if not pd.api.types.is_integer_dtype(ldf.index):
        idx_name = ldf.index.name
        ldf.unique_values[idx_name] = list(ldf.index)
        ldf.cardinality[idx_name] = len(ldf.index)
def compute_stats(self, ldf: LuxDataFrame):
    """Precompute unique values, cardinality, and min/max statistics for ldf.

    Float columns are assigned a sentinel cardinality of 999 instead of
    having their unique values materialized.
    """
    ldf.unique_values = {}
    ldf._min_max = {}
    ldf.cardinality = {}

    for attribute in ldf.columns:
        if (isinstance(attribute, pd._libs.tslibs.timestamps.Timestamp)):
            # If timestamp, make the dictionary keys the _repr_ (e.g., TimeStamp('2020-04-05 00.000')--> '2020-04-05')
            attribute_repr = str(attribute._date_repr)
        else:
            attribute_repr = attribute
        if ldf.dtypes[attribute] != "float64":
            ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
            # BUG FIX: index by attribute_repr (the key actually stored just
            # above); indexing by the raw attribute raised KeyError whenever
            # the column label was a Timestamp.
            ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr])
        else:
            # Sentinel cardinality for float columns, whose unique values are
            # deliberately not materialized (previous comment claiming this
            # was for "non-numeric" attributes was inverted).
            ldf.cardinality[attribute_repr] = 999
        if ldf.dtypes[attribute] == "float64" or ldf.dtypes[attribute] == "int64":
            ldf._min_max[attribute_repr] = (ldf[attribute].min(), ldf[attribute].max())

    # A non-integer index is recorded as an extra attribute keyed by its name.
    if (ldf.index.dtype != 'int64'):
        index_column_name = ldf.index.name
        ldf.unique_values[index_column_name] = list(ldf.index)
        ldf.cardinality[index_column_name] = len(ldf.index)
def execute_approx_sample(ldf: LuxDataFrame):
    """
    Compute and cache an approximate sample of the overall dataframe
    for the purpose of early pruning of the visualization search space

    Parameters
    ----------
    ldf : LuxDataFrame
    """
    # Memoized: compute the approximate sample at most once per dataframe.
    if ldf._approx_sample is not None:
        return
    sampled = ldf._sampled
    if len(sampled) > lux.config.early_pruning_sample_start:
        # Cap the approximate sample at a fixed size for cheap pruning passes.
        ldf._approx_sample = sampled.sample(
            n=lux.config.early_pruning_sample_cap, random_state=1)
    else:
        ldf._approx_sample = sampled
def compile_intent(ldf: LuxDataFrame, _inferred_intent: List[Clause]) -> VisList:
    """
    Compile the input intent specifications of the ldf into a collection of
    lux.Vis objects for visualization.

    1) Enumerate a collection of visualizations interested by the user to generate a vis list
    2) Expand underspecified specifications (lux.Clause) for each of the generated visualizations.
    3) Determine encoding properties for each vis

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent.
    _inferred_intent : list[lux.Clause]
        Intent clauses to compile.

    Returns
    -------
    vis_collection: list[lux.Vis]
        vis list with compiled lux.Vis objects.
    """
    if _inferred_intent:
        collection = Compiler.enumerate_collection(_inferred_intent, ldf)
        # Autofill data type/model information for every enumerated vis.
        Compiler.populate_data_type_model(ldf, collection)
        # Drop visualizations that became invalid after type population.
        if len(collection) >= 1:
            collection = Compiler.remove_all_invalid(collection)
        # Autofill encoding (mark/channel) information per vis.
        for vis in collection:
            Compiler.determine_encoding(ldf, vis)
        ldf._compiled = True
        return collection
def execute_sampling(ldf: LuxDataFrame):
    """Cache a row sample of ``ldf`` in ``ldf._sampled`` for visualization.

    Frames above 30k rows are capped at a 30k-row sample; frames above 10k
    rows are sampled down to 75%; smaller frames are used as-is.
    """
    SAMPLE_START = 10000  # row count above which fractional sampling kicks in
    SAMPLE_CAP = 30000  # row count above which the sample size is capped
    SAMPLE_FRAC = 0.75

    n_rows = len(ldf)
    if n_rows > SAMPLE_CAP:
        # memoize unfiltered sample df
        if ldf._sampled is None:
            ldf._sampled = ldf.sample(n=SAMPLE_CAP, random_state=1)
        ldf._message.add_unique(
            f"Large dataframe detected: Lux is only visualizing a random sample capped at {SAMPLE_CAP} rows.",
            priority=99)
    elif n_rows > SAMPLE_START:
        # memoize unfiltered sample df
        if ldf._sampled is None:
            ldf._sampled = ldf.sample(frac=SAMPLE_FRAC, random_state=1)
        ldf._message.add_unique(
            f"Large dataframe detected: Lux is only visualizing a random sample of {len(ldf._sampled)} rows.",
            priority=99)
    else:
        ldf._sampled = ldf
def compile_vis(ldf: LuxDataFrame, vis: Vis) -> VisList:
    """Compile a single Vis into a validated, encoding-filled vis collection."""
    if (vis):
        # Autofill data type/model information for the wrapped vis.
        compiled = Compiler.populate_data_type_model(ldf, [vis])
        # Remove invalid visualizations from the collection.
        compiled = Compiler.remove_all_invalid(compiled)
        # Autofill encoding-related information per vis.
        for single_vis in compiled:
            Compiler.determine_encoding(ldf, single_vis)
        ldf._compiled = True
        return compiled
def execute(vislist: VisList, ldf: LuxDataFrame):
    '''
    Given a VisList, fetch the data required to render the vis.
    1) Apply filters
    2) Retrieve relevant attribute
    3) Perform vis-related processing (aggregation, binning)
    4) return a DataFrame with relevant results

    Parameters
    ----------
    vislist: list[lux.Vis]
        vis list that contains lux.Vis objects for visualization.
    ldf : lux.core.frame
        LuxDataFrame with specified intent.

    Returns
    -------
    None
    '''
    for vis in vislist:
        vis._vis_data = ldf  # The vis data starts off being the same as the content of the original dataframe
        # Apply the vis's filter clauses; returns whether any filter ran.
        filter_executed = PandasExecutor.execute_filter(vis)
        # Select relevant data based on attribute information
        attributes = set([])
        for clause in vis._inferred_intent:
            if (clause.attribute):
                # "Record" is a count placeholder, not a real column.
                if (clause.attribute != "Record"):
                    attributes.add(clause.attribute)
        # General Sampling
        if len(vis.data) > 10000:
            if (filter_executed):
                # Filtered data is vis-specific, so its sample cannot be shared.
                vis._vis_data = vis.data.sample(frac=0.75, random_state=1)
            else:
                if (ldf._sampled is None):  # memoize unfiltered sample df
                    ldf._sampled = vis.data.sample(frac=0.75, random_state=1)
                vis._vis_data = ldf._sampled
        # TODO: Add some type of cap size on Nrows ?
        # Project down to only the columns this vis needs.
        vis._vis_data = vis.data[list(attributes)]
        if (vis.mark == "bar" or vis.mark == "line"):
            PandasExecutor.execute_aggregate(vis, isFiltered=filter_executed)
        elif (vis.mark == "histogram"):
            PandasExecutor.execute_binning(vis)
        elif (vis.mark == "scatter"):
            # Large scatterplots are downgraded to heatmaps for rendering.
            if (len(vis.data) > 10000):
                vis._mark = "heatmap"
                PandasExecutor.execute_2D_binning(vis)
def _repr_html_(self):
    """Render this Vis as a Lux widget in Jupyter; raises if no data is attached."""
    from IPython.display import display
    check_import_lux_widget()
    import luxWidget
    if (self.data is None):
        raise Exception(
            "No data is populated in Vis. In order to generate data required for the vis, use the 'refresh_source' function to populate the Vis with a data source (e.g., vis.refresh_source(df))."
        )
    from lux.core.frame import LuxDataFrame
    # Serialize only this vis; a standalone view carries no recommendations,
    # intent, or message.
    widget = luxWidget.LuxWidget(
        currentVis=LuxDataFrame.current_vis_to_JSON([self]),
        recommendations=[],
        intent="",
        message="",
    )
    display(widget)
def recommendation(self):
    """Compute (when empty) and return the recommendations for this series."""
    from lux.core.frame import LuxDataFrame
    # An unnamed series breaks downstream column handling; use a blank name.
    if self.name is None:
        self.name = " "
    frame = LuxDataFrame(self)
    # NOTE(review): recommendations are refreshed only when _recommendation is
    # an empty dict; a value of None skips the refresh — confirm intended.
    needs_refresh = self._recommendation is not None and self._recommendation == {}
    if needs_refresh:
        frame.maintain_metadata()
        frame.maintain_recs()
    return frame._recommendation
def _repr_html_(self):
    """Render the VisList as a Lux widget in Jupyter."""
    self._widget = None
    from IPython.display import display
    from lux.core.frame import LuxDataFrame
    # Package the collection as a single pseudo-recommendation for the widget.
    recommendation = {
        "action": "Vis List",
        "description": "Shows a vis list defined by the intent",
        "collection": self._collection,
    }
    check_import_lux_widget()
    import luxwidget
    recJSON = LuxDataFrame.rec_to_JSON([recommendation])
    self._widget = luxwidget.LuxWidget(
        currentVis={}, recommendations=recJSON, intent="", message=""
    )
    display(self._widget)
def compile_vis(ldf: LuxDataFrame, vis: Vis) -> Vis:
    """
    Root method for compiling visualizations

    Parameters
    ----------
    ldf : LuxDataFrame
    vis : Vis

    Returns
    -------
    Vis
        Compiled Vis object
    """
    if vis:
        # Autofill data type/model information.
        Compiler.populate_data_type_model(ldf, [vis])
        # Remove invalid visualizations from collection.
        Compiler.remove_all_invalid([vis])
        # Autofill viz-related (encoding) information.
        Compiler.determine_encoding(ldf, vis)
    ldf._compiled = True
    return vis
def f(*args, **kwargs):
    """Constructor hook: build a LuxDataFrame and copy over Lux metadata."""
    new_df = LuxDataFrame(*args, **kwargs)
    # Carry each metadata attribute across, defaulting to None when absent.
    new_df.__dict__.update(
        {attr: getattr(self, attr, None) for attr in self._metadata}
    )
    return new_df
def _ipython_display_(self):
    """Jupyter rich-display hook: render the series as a Lux widget with a
    Pandas/Lux toggle button, falling back to the plain pandas repr when the
    series cannot be visualized or rendering fails."""
    from IPython.display import display
    from IPython.display import clear_output
    import ipywidgets as widgets
    from lux.core.frame import LuxDataFrame

    series_repr = super(LuxSeries, self).__repr__()
    ldf = LuxDataFrame(self)
    # Default column name 0 causes errors
    if self.name is None:
        ldf = ldf.rename(columns={0: " "})
    self._ldf = ldf
    try:
        # Ignore recommendations when Series a results of:
        # 1) Values of the series are of dtype objects (df.dtypes)
        is_dtype_series = (all(
            isinstance(val, np.dtype) for val in self.values) and len(self.values) != 0)
        # 2) Mixed type, often a result of a "row" acting as a series (df.iterrows, df.iloc[0])
        # Tolerant for NaNs + 1 type
        mixed_dtype = len(set([type(val) for val in self.values])) > 2
        if ldf._pandas_only or is_dtype_series or mixed_dtype:
            print(series_repr)
            ldf._pandas_only = False
        else:
            # Hierarchical indexes are not supported; skip metadata for them.
            if not self.index.nlevels >= 2:
                ldf.maintain_metadata()
            if lux.config.default_display == "lux":
                self._toggle_pandas_display = False
            else:
                self._toggle_pandas_display = True
            # df_to_display.maintain_recs() # compute the recommendations (TODO: This can be rendered in another thread in the background to populate self._widget)
            ldf.maintain_recs(is_series="Series")
            # Observers(callback_function, listen_to_this_variable)
            ldf._widget.observe(ldf.remove_deleted_recs, names="deletedIndices")
            ldf._widget.observe(ldf.set_intent_on_click, names="selectedIntentIndex")
            self._widget = ldf._widget
            self._recommendation = ldf._recommendation
            # box = widgets.Box(layout=widgets.Layout(display='inline'))
            button = widgets.Button(
                description="Toggle Pandas/Lux",
                layout=widgets.Layout(width="140px", top="5px"),
            )
            ldf.output = widgets.Output()
            # box.children = [button,output]
            # output.children = [button]
            # display(box)
            display(button, ldf.output)

            def on_button_clicked(b):
                # b is None on the initial programmatic call below, so the
                # toggle state is only flipped on real button clicks.
                with ldf.output:
                    if b:
                        self._toggle_pandas_display = not self._toggle_pandas_display
                    clear_output()
                    if self._toggle_pandas_display:
                        print(series_repr)
                    else:
                        # b.layout.display = "none"
                        display(ldf._widget)
                        # b.layout.display = "inline-block"

            button.on_click(on_button_clicked)
            on_button_clicked(None)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        # Any rendering failure degrades gracefully to the pandas repr.
        warnings.warn(
            "\nUnexpected error in rendering Lux widget and recommendations. "
            "Falling back to Pandas display.\n"
            "Please report the following issue on Github: https://github.com/lux-org/lux/issues \n",
            stacklevel=2,
        )
        warnings.warn(traceback.format_exc())
        display(self.to_pandas())
def compute_data_type(self, ldf: LuxDataFrame):
    """Infer a Lux data type per column of ``ldf`` into ``ldf._data_type``.

    User-supplied overrides in ``ldf._type_override`` win; otherwise columns
    are classified as 'temporal', 'quantitative', 'nominal', or 'id'. Warns
    about temporal-looking columns that are not Datetime dtype.
    """
    from pandas.api.types import is_datetime64_any_dtype as is_datetime

    for attr in list(ldf.columns):
        # Explicit user override takes precedence over all inference.
        if attr in ldf._type_override:
            ldf._data_type[attr] = ldf._type_override[attr]
        else:
            temporal_var_list = [
                "month", "year", "day", "date", "time", "weekday"
            ]
            if is_datetime(ldf[attr]):
                ldf._data_type[attr] = "temporal"
            elif self._is_datetime_string(ldf[attr]):
                ldf._data_type[attr] = "temporal"
            elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
                ldf._data_type[attr] = "temporal"
            elif str(attr).lower() in temporal_var_list:
                # Column *name* looks temporal (e.g. "year").
                ldf._data_type[attr] = "temporal"
            elif self._is_datetime_number(ldf[attr]):
                ldf._data_type[attr] = "temporal"
            elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
                # int columns gets coerced into floats if contain NaN
                convertible2int = pd.api.types.is_integer_dtype(
                    ldf[attr].convert_dtypes())
                if (convertible2int and ldf.cardinality[attr] != len(ldf)
                        and ldf.cardinality[attr] < 20):
                    ldf._data_type[attr] = "nominal"
                else:
                    ldf._data_type[attr] = "quantitative"
            elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
                # See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values
                # NOTE(review): the code actually uses a threshold of 20
                # unique values, not 10 as the comment above says.
                if ldf.pre_aggregated:
                    if ldf.cardinality[attr] == len(ldf):
                        ldf._data_type[attr] = "nominal"
                if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 20:
                    ldf._data_type[attr] = "nominal"
                else:
                    ldf._data_type[attr] = "quantitative"
                # id-like integer columns (per check_if_id_like) win over both.
                if check_if_id_like(ldf, attr):
                    ldf._data_type[attr] = "id"
            # Eliminate this clause because a single NaN value can cause the dtype to be object
            elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
                if check_if_id_like(ldf, attr):
                    ldf._data_type[attr] = "id"
                else:
                    ldf._data_type[attr] = "nominal"
            # check if attribute is any type of datetime dtype
            elif is_datetime_series(ldf.dtypes[attr]):
                ldf._data_type[attr] = "temporal"
            else:
                ldf._data_type[attr] = "nominal"
    # A named, non-integer index is treated as an extra nominal attribute.
    if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name:
        ldf._data_type[ldf.index.name] = "nominal"

    # Warn about columns classified as temporal that are not Datetime dtype,
    # with a conversion template per offending column.
    non_datetime_attrs = []
    for attr in ldf.columns:
        if ldf._data_type[attr] == "temporal" and not is_datetime(ldf[attr]):
            non_datetime_attrs.append(attr)
    warn_msg = ""
    if len(non_datetime_attrs) == 1:
        warn_msg += f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
    elif len(non_datetime_attrs) > 1:
        warn_msg += f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
    if len(non_datetime_attrs) > 0:
        warn_msg += "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\nFor example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n\nHere is a starter template that you can use for converting the temporal fields:\n"
        for attr in non_datetime_attrs:
            warn_msg += f"\tdf['{attr}'] = pd.to_datetime(df['{attr}'], format='<replace-with-datetime-format>')\n"
        warn_msg += "\nSee more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html"
        # NOTE(review): `attr` here is the last loop variable, so the
        # override hint only names the final non-datetime column.
        warn_msg += f"\nIf {attr} is not a temporal attribute, please use override Lux's automatically detected type:"
        warn_msg += f"\n\tdf.set_data_type({{'{attr}':'quantitative'}})"
        warnings.warn(warn_msg, stacklevel=2)
def compute_dataset_metadata(self, ldf: LuxDataFrame):
    """Reset and recompute the per-column data-type metadata for ``ldf``."""
    # Start from a clean type map, then infer a type for each column.
    ldf._data_type = {}
    self.compute_data_type(ldf)
def pandas_to_lux(df):
    """Convert a plain pandas DataFrame into a LuxDataFrame."""
    from lux.core.frame import LuxDataFrame
    # Rebuild from raw values + column labels (note: the original index is
    # not carried over by this construction).
    return LuxDataFrame(df.values.tolist(), columns=df.columns)
def __repr__(self):
    """Render the series as a Lux widget with a Pandas/Lux toggle, printing
    the plain pandas repr (and returning "") when Lux cannot operate on the
    series or rendering fails."""
    from IPython.display import display
    from IPython.display import clear_output
    import ipywidgets as widgets
    from lux.core.frame import LuxDataFrame

    series_repr = super(LuxSeries, self).__repr__()
    ldf = LuxDataFrame(self)
    try:
        if ldf._pandas_only:
            print(series_repr)
            ldf._pandas_only = False
        else:
            # Hierarchical indexes are unsupported: warn and fall back.
            if self.index.nlevels >= 2:
                warnings.warn(
                    "\nLux does not currently support series "
                    "with hierarchical indexes.\n"
                    "Please convert the series into a flat "
                    "table via `pandas.DataFrame.reset_index`.\n",
                    stacklevel=2,
                )
                print(series_repr)
                return ""
            # Empty series cannot be profiled: warn and fall back.
            if len(self) <= 0:
                warnings.warn(
                    "\nLux can not operate on an empty series.\nPlease check your input again.\n",
                    stacklevel=2,
                )
                print(series_repr)
                return ""
            ldf.maintain_metadata()
            if lux.config.default_display == "lux":
                self._toggle_pandas_display = False
            else:
                self._toggle_pandas_display = True
            # df_to_display.maintain_recs() # compute the recommendations (TODO: This can be rendered in another thread in the background to populate self._widget)
            ldf.maintain_recs()
            # Observers(callback_function, listen_to_this_variable)
            ldf._widget.observe(ldf.remove_deleted_recs, names="deletedIndices")
            ldf._widget.observe(ldf.set_intent_on_click, names="selectedIntentIndex")
            if len(ldf.recommendation) > 0:
                # box = widgets.Box(layout=widgets.Layout(display='inline'))
                button = widgets.Button(
                    description="Toggle Pandas/Lux",
                    layout=widgets.Layout(width="140px", top="5px"),
                )
                ldf.output = widgets.Output()
                # box.children = [button,output]
                # output.children = [button]
                # display(box)
                display(button, ldf.output)

                def on_button_clicked(b):
                    # b is None on the initial programmatic call below, so the
                    # toggle state is only flipped on real button clicks.
                    with ldf.output:
                        if b:
                            self._toggle_pandas_display = not self._toggle_pandas_display
                        clear_output()
                        if self._toggle_pandas_display:
                            print(series_repr)
                        else:
                            # b.layout.display = "none"
                            display(ldf._widget)
                            # b.layout.display = "inline-block"

                button.on_click(on_button_clicked)
                on_button_clicked(None)
            else:
                warnings.warn(
                    "\nLux defaults to Pandas when there are no valid actions defined.",
                    stacklevel=2,
                )
                print(series_repr)
    except (KeyboardInterrupt, SystemExit):
        raise
    # NOTE(review): bare except below swallows all remaining exception types;
    # a narrower `except Exception` would be safer — confirm before changing.
    except:
        warnings.warn(
            "\nUnexpected error in rendering Lux widget and recommendations. "
            "Falling back to Pandas display.\n\n"
            "Please report this issue on Github: https://github.com/lux-org/lux/issues ",
            stacklevel=2,
        )
        print(series_repr)
    return ""