def validate_clause(clause):
    """Raise ``ValueError`` when ``clause`` names a column or value missing from ``ldf``.

    ``ldf`` is not a parameter; it is resolved from the surrounding scope.

    Nothing is checked when the attribute or the value is the wildcard
    ``"?"``, or when the attribute is the special name ``"Record"``.

    Raises:
        ValueError: an attribute is not among ``ldf.columns``, or an ``=``
            filter value does not occur in the (non-datetime) column.
    """
    # Wildcard clauses have nothing concrete to validate.
    if (clause.attribute and clause.attribute == "?") or (
        clause.value and clause.value == "?"
    ):
        return

    # A list of attributes: every entry has to be an existing column.
    if isinstance(clause.attribute, list):
        for name in clause.attribute:
            if name not in list(ldf.columns):
                raise ValueError(
                    f"The input attribute {name} does not exist in the DataFrame."
                )
        return

    if clause.attribute == "Record":
        return

    # we don't value check datetime since datetime can take filter values
    # that don't exactly match the exact TimeStamp representation
    needs_column_check = clause.attribute and not is_datetime_string(clause.attribute)
    if needs_column_check and clause.attribute not in list(ldf.columns):
        raise ValueError(
            f"The input attribute {clause.attribute} does not exist in the DataFrame."
        )

    # Only equality filters on a named attribute get their values checked.
    if not (clause.value and clause.attribute and clause.filter_op == "="):
        return

    series = ldf[clause.attribute]
    if is_datetime_series(series):
        return

    candidates = clause.value if isinstance(clause.value, list) else [clause.value]
    for val in candidates:
        if val not in series.values:
            raise ValueError(
                f"The input value {val} does not exist for the attribute {clause.attribute} for the DataFrame."
            )
def compute_data_type(self, ldf: LuxDataFrame):
    """Assign a data type to every column of ``ldf`` and warn about likely temporal columns.

    Fills ``ldf.data_type_lookup`` (column -> ``"temporal"``,
    ``"quantitative"``, ``"nominal"`` or ``"id"``), then sets
    ``ldf.data_type`` to ``self.mapping(ldf.data_type_lookup)``. Finally a
    warning is issued for columns classified as temporal whose dtype is not
    a pandas datetime dtype.

    Args:
        ldf: the dataframe to classify; its ``data_type_lookup`` and
            ``data_type`` attributes are modified in place.
    """
    from pandas.api.types import is_datetime64_any_dtype as is_datetime

    # Column names that are treated as temporal regardless of dtype.
    temporal_var_list = ("month", "year", "day", "date", "time")

    for attr in list(ldf.columns):
        if isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
            # The column label itself is a Timestamp.
            ldf.data_type_lookup[attr] = "temporal"
        elif str(attr).lower() in temporal_var_list:
            ldf.data_type_lookup[attr] = "temporal"
        elif ldf.dtypes[attr] == "float64":
            ldf.data_type_lookup[attr] = "quantitative"
        elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
            # Integers are nominal when the ratio cardinality / row count is
            # below 0.4 and there are fewer than 10 unique values.
            # NOTE(review): the pre-aggregated assignment below is always
            # overwritten by the if/elif/else that follows; kept unchanged
            # because the intended precedence is unclear -- confirm.
            if ldf.pre_aggregated:
                if ldf.cardinality[attr] == len(ldf):
                    ldf.data_type_lookup[attr] = "nominal"
            if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 10:
                ldf.data_type_lookup[attr] = "nominal"
            elif check_if_id_like(ldf, attr):
                ldf.data_type_lookup[attr] = "id"
            else:
                ldf.data_type_lookup[attr] = "quantitative"
        # A single NaN value can cause the dtype to be object.
        elif ldf.dtypes[attr] == "object":
            ldf.data_type_lookup[attr] = "nominal"
        elif is_datetime_series(ldf.dtypes[attr]):
            # Any kind of datetime dtype.
            ldf.data_type_lookup[attr] = "temporal"
        else:
            # Dtypes matched by no branch above (e.g. bool, category,
            # float32) previously got no entry, which made the lookup in the
            # warning pass below raise KeyError.
            ldf.data_type_lookup[attr] = "nominal"

    if ldf.index.dtype != 'int64' and ldf.index.name:
        ldf.data_type_lookup[ldf.index.name] = "nominal"
    ldf.data_type = self.mapping(ldf.data_type_lookup)

    # Columns judged temporal that are not stored with a datetime dtype.
    non_datetime_attrs = [
        attr
        for attr in ldf.columns
        if ldf.data_type_lookup[attr] == 'temporal' and not is_datetime(ldf[attr])
    ]
    if len(non_datetime_attrs) == 1:
        warnings.warn(
            f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
            "In order to display visualizations for this attribute accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
            "Please consider converting this attribute using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
            "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
            "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
            stacklevel=2)
    elif len(non_datetime_attrs) > 1:
        warnings.warn(
            f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
            "In order to display visualizations for these attributes accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
            "Please consider converting these attributes using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
            "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
            "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
            stacklevel=2)
def validate_clause(clause):
    """Check one clause against ``ldf`` and describe every problem found.

    ``ldf`` is not a parameter; it is resolved from the surrounding scope.

    Nothing is checked when the attribute or value is the wildcard ``"?"``,
    when the attribute is empty, or when the attribute is ``"Record"``.

    Returns:
        str: the concatenation of one ``"\\n- ..."`` bullet per problem, or
        ``""`` when the clause is fine. All problems are reported (earlier
        findings are no longer overwritten by later ones).
    """
    # Wildcards and empty attributes carry nothing to validate.
    if clause.attribute == "?" or clause.value == "?" or clause.attribute == "":
        return ""

    issues = []

    # A list of attributes: report every entry that is not a column.
    if isinstance(clause.attribute, list):
        columns = list(ldf.columns)
        for attr in clause.attribute:
            if attr not in columns:
                issues.append(
                    f"\n- The input attribute '{attr}' does not exist in the DataFrame."
                )
        return "".join(issues)

    if clause.attribute == "Record":
        return ""

    missing_attribute_msg = f"\n- The input attribute '{clause.attribute}' does not exist in the DataFrame. \n Please check your input intent for typos."

    # we don't value check datetime since datetime can take filter values
    # that don't exactly match the exact TimeStamp representation
    if isinstance(clause.attribute, str) and not is_datetime_string(clause.attribute):
        if clause.attribute not in list(ldf.columns):
            search_val = clause.attribute
            # None (not False) as the sentinel so a falsy column name that
            # matches is still recognised. The last matching column wins.
            match_attr = None
            for attr, val_list in ldf.unique_values.items():
                if search_val in val_list:
                    match_attr = attr
            if match_attr is not None:
                issues.append(
                    f"\n- The input '{search_val}' looks like a value that belongs to the '{match_attr}' attribute. \n Please specify the value fully, as something like {match_attr}={search_val}."
                )
            else:
                issues.append(missing_attribute_msg)

    if clause.value != "" and clause.filter_op == "=":
        # Skip check for NaN filter values
        if not lux.utils.utils.like_nan(clause.value):
            try:
                series = ldf[clause.attribute]
            except KeyError:
                # The column does not exist, so there are no values to check.
                # Report it (unless already reported above) instead of
                # letting the lookup error escape from a validation helper.
                if not issues:
                    issues.append(missing_attribute_msg)
                return "".join(issues)
            if not is_datetime_series(series):
                vals = clause.value if isinstance(clause.value, list) else [clause.value]
                for val in vals:
                    if (lux.config.executor.name == "PandasExecutor"
                            and val not in series.values):
                        issues.append(
                            f"\n- The input value '{val}' does not exist for the attribute '{clause.attribute}' for the DataFrame."
                        )
    return "".join(issues)
def compute_data_type(self, ldf: LuxDataFrame):
    """Assign a data type to every column of ``ldf`` and warn about likely temporal columns.

    Fills ``ldf._data_type`` (column -> ``"temporal"``, ``"quantitative"``,
    ``"nominal"`` or ``"id"``). Entries in ``ldf._type_override`` take
    precedence over detection. Afterwards a single warning is issued for
    columns classified as temporal whose dtype is not a pandas datetime
    dtype, including a conversion template and an override hint for each
    such column.

    Args:
        ldf: the dataframe to classify; ``ldf._data_type`` is modified in
            place.
    """
    from pandas.api.types import is_datetime64_any_dtype as is_datetime

    # Column names that are treated as temporal regardless of dtype.
    temporal_var_list = ("month", "year", "day", "date", "time", "weekday")

    for attr in list(ldf.columns):
        if attr in ldf._type_override:
            ldf._data_type[attr] = ldf._type_override[attr]
            continue

        if is_datetime(ldf[attr]):
            ldf._data_type[attr] = "temporal"
        elif self._is_datetime_string(ldf[attr]):
            ldf._data_type[attr] = "temporal"
        elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
            ldf._data_type[attr] = "temporal"
        elif str(attr).lower() in temporal_var_list:
            ldf._data_type[attr] = "temporal"
        elif self._is_datetime_number(ldf[attr]):
            ldf._data_type[attr] = "temporal"
        elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
            # int columns gets coerced into floats if contain NaN
            convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
            if (convertible2int and ldf.cardinality[attr] != len(ldf)
                    and ldf.cardinality[attr] < 20):
                ldf._data_type[attr] = "nominal"
            else:
                ldf._data_type[attr] = "quantitative"
        elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
            # Integers are nominal when the ratio cardinality / row count is
            # below 0.4 and there are fewer than 20 unique values.
            # NOTE(review): the pre-aggregated assignment below is always
            # overwritten by the if/else that follows; kept unchanged
            # because the intended precedence is unclear -- confirm.
            if ldf.pre_aggregated:
                if ldf.cardinality[attr] == len(ldf):
                    ldf._data_type[attr] = "nominal"
            if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 20:
                ldf._data_type[attr] = "nominal"
            else:
                ldf._data_type[attr] = "quantitative"
            if check_if_id_like(ldf, attr):
                ldf._data_type[attr] = "id"
        # A single NaN value can cause the dtype to be object.
        elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
            if check_if_id_like(ldf, attr):
                ldf._data_type[attr] = "id"
            else:
                ldf._data_type[attr] = "nominal"
        # check if attribute is any type of datetime dtype
        elif is_datetime_series(ldf.dtypes[attr]):
            ldf._data_type[attr] = "temporal"
        else:
            ldf._data_type[attr] = "nominal"

    if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name:
        ldf._data_type[ldf.index.name] = "nominal"

    # Columns judged temporal that are not stored with a datetime dtype.
    non_datetime_attrs = [
        attr
        for attr in ldf.columns
        if ldf._data_type[attr] == "temporal" and not is_datetime(ldf[attr])
    ]
    if not non_datetime_attrs:
        return

    parts = []
    if len(non_datetime_attrs) == 1:
        parts.append(
            f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
        )
    else:
        parts.append(
            f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
        )
    parts.append(
        "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\nFor example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n\nHere is a starter template that you can use for converting the temporal fields:\n"
    )
    for attr in non_datetime_attrs:
        parts.append(
            f"\tdf['{attr}'] = pd.to_datetime(df['{attr}'], format='<replace-with-datetime-format>')\n"
        )
    parts.append(
        "\nSee more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html"
    )
    # One override hint per attribute. Previously the hint reused the loop
    # variable after the loop ended, so only the last attribute was named.
    for attr in non_datetime_attrs:
        parts.append(
            f"\nIf {attr} is not a temporal attribute, please use override Lux's automatically detected type:"
        )
        parts.append(f"\n\tdf.set_data_type({{'{attr}':'quantitative'}})")
    warnings.warn("".join(parts), stacklevel=2)