def compute_data_type(self, ldf: LuxDataFrame):
    """
    Infer a Lux data type for every column of ``ldf``.

    Each column is classified as "temporal", "quantitative", "nominal" or "id"
    and the result is stored in ``ldf.data_type_lookup``. ``ldf.data_type`` is
    then rebuilt with ``self.mapping``, and a warning is issued for columns that
    were classified as temporal but are not stored with a pandas datetime dtype.

    Parameters
    ----------
    ldf : LuxDataFrame
        Dataframe whose ``data_type_lookup`` and ``data_type`` are updated in
        place. ``ldf.cardinality`` and ``ldf.pre_aggregated`` are read for
        integer columns.

    Returns
    -------
    None
    """
    from pandas.api.types import is_datetime64_any_dtype as is_datetime

    # Column names that are treated as temporal regardless of their dtype.
    temporal_var_list = ["month", "year", "day", "date", "time"]
    n_rows = len(ldf)

    for attr in list(ldf.columns):
        if isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
            # The column label itself is a Timestamp.
            ldf.data_type_lookup[attr] = "temporal"
        elif str(attr).lower() in temporal_var_list:
            ldf.data_type_lookup[attr] = "temporal"
        elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
            ldf.data_type_lookup[attr] = "quantitative"
        elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
            if ldf.pre_aggregated:
                if ldf.cardinality[attr] == n_rows:
                    # NOTE(review): this assignment is always superseded by the
                    # if/else below (cardinality == n_rows can never satisfy the
                    # ratio test). Kept unchanged pending confirmation of intent.
                    ldf.data_type_lookup[attr] = "nominal"
            # An integer column is nominal when fewer than 40% of its rows are
            # distinct and it has fewer than 30 distinct values. The ratio is
            # undefined for an empty frame, which falls back to quantitative
            # instead of raising ZeroDivisionError.
            if (
                n_rows > 0
                and ldf.cardinality[attr] / n_rows < 0.4
                and ldf.cardinality[attr] < 30
            ):
                ldf.data_type_lookup[attr] = "nominal"
            else:
                ldf.data_type_lookup[attr] = "quantitative"
            if check_if_id_like(ldf, attr):
                ldf.data_type_lookup[attr] = "id"
        elif pd.api.types.is_object_dtype(ldf.dtypes[attr]):
            # A single NaN value can cause a column's dtype to be object.
            if check_if_id_like(ldf, attr):
                ldf.data_type_lookup[attr] = "id"
            else:
                ldf.data_type_lookup[attr] = "nominal"
        elif is_datetime_series(ldf.dtypes[attr]):
            # Any kind of datetime dtype.
            ldf.data_type_lookup[attr] = "temporal"
        elif attr not in ldf.data_type_lookup:
            # No rule matched (e.g. boolean or categorical dtype). Without an
            # entry the temporal check below would fail on a missing key, so
            # default to nominal while leaving any existing entry untouched.
            ldf.data_type_lookup[attr] = "nominal"

    # A named, non-int64 index is treated as a nominal attribute.
    if ldf.index.dtype != 'int64' and ldf.index.name:
        ldf.data_type_lookup[ldf.index.name] = "nominal"

    ldf.data_type = self.mapping(ldf.data_type_lookup)

    # Columns that look temporal but are not stored as pandas datetimes.
    non_datetime_attrs = [
        attr
        for attr in ldf.columns
        if ldf.data_type_lookup[attr] == 'temporal' and not is_datetime(ldf[attr])
    ]

    if len(non_datetime_attrs) == 1:
        warnings.warn(
            f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
            "In order to display visualizations for this attribute accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
            "Please consider converting this attribute using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
            "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
            "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
            stacklevel=2,
        )
    elif len(non_datetime_attrs) > 1:
        warnings.warn(
            f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
            "In order to display visualizations for these attributes accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
            "Please consider converting these attributes using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
            "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
            "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
            stacklevel=2,
        )
def compute_data_type(self, lst: LuxSQLTable):
    """
    Compute the Lux data type of each column in the SQL table behind the
    specified LuxSQLTable and store the result in ``lst._data_type``.

    The SQL type of every column is looked up in ``INFORMATION_SCHEMA.COLUMNS``
    and mapped to "temporal", "nominal", "quantitative" or "id". Columns whose
    SQL type matches none of the rules are left out of the result.

    Parameters
    ----------
    lst: lux.LuxSQLTable
        lux.LuxSQLTable object whose metadata will be calculated

    Returns
    -------
    None
    """
    data_type = {}
    sql_dtypes = {}
    self.get_cardinality(lst)

    # Drop everything up to the first "." (a schema qualifier, presumably) so
    # the name can be matched against INFORMATION_SCHEMA's TABLE_NAME column.
    if "." in lst.table_name:
        table_name = lst.table_name[lst.table_name.index(".") + 1 :]
    else:
        table_name = lst.table_name

    # get the data types of the attributes in the SQL table
    # NOTE(review): table and column names are interpolated directly into the
    # query text; this assumes they come from the database schema rather than
    # from untrusted input - confirm against callers.
    for attr in list(lst.columns):
        datatype_query = "SELECT DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{}' AND COLUMN_NAME = '{}'".format(
            table_name, attr
        )
        datatype = list(pandas.read_sql(datatype_query, lux.config.SQLconnection)["data_type"])[0]
        sql_dtypes[attr] = datatype

    for attr in list(lst.columns):
        if str(attr).lower() in ["month", "year"]:
            data_type[attr] = "temporal"
        elif sql_dtypes[attr] in [
            "character",
            "character varying",
            "boolean",
            "uuid",
            "text",
        ]:
            data_type[attr] = "nominal"
        elif sql_dtypes[attr] in [
            "integer",
            "numeric",
            "decimal",
            "bigint",
            "real",
            "smallint",
            "smallserial",
            "serial",
            "double precision",
        ]:
            # Numeric columns with fewer than 13 distinct values are nominal.
            if lst.cardinality[attr] < 13:
                data_type[attr] = "nominal"
            elif check_if_id_like(lst, attr):
                # Record the result in the local mapping: writing to
                # lst._data_type here would be discarded by the assignment at
                # the end of this method.
                data_type[attr] = "id"
            else:
                data_type[attr] = "quantitative"
        elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]:
            data_type[attr] = "temporal"

    lst._data_type = data_type
def compute_data_type(self, tbl: LuxSQLTable):
    """
    Determine the Lux data type of each column in the SQL table behind the
    specified LuxSQLTable and store the mapping in ``tbl._data_type``.

    Parameters
    ----------
    tbl: lux.LuxSQLTable
        lux.LuxSQLTable object whose metadata will be calculated

    Returns
    -------
    None
    """
    temporal_names = {"month", "year"}
    nominal_sql_types = {
        "character",
        "character varying",
        "boolean",
        "uuid",
        "text",
    }
    numeric_sql_types = {
        "integer",
        "numeric",
        "decimal",
        "bigint",
        "real",
        "smallint",
        "smallserial",
        "serial",
        "double",
        "double precision",
    }

    inferred = {}
    self.get_cardinality(tbl)

    # Keep only the part of the table name after the first "." when present.
    if "." in tbl.table_name:
        bare_name = tbl.table_name[tbl.table_name.index(".") + 1 :]
    else:
        bare_name = tbl.table_name

    # Look up each column's SQL type and translate it to a Lux data type.
    for column in list(tbl.columns):
        type_query = lux.config.query_templates['datatype_query'].format(table_name = bare_name, attribute = column)
        type_frame = pandas.read_sql(type_query, lux.config.SQLconnection)
        sql_type = list(type_frame["data_type"])[0]

        # Temporal by column name, or by a time/date SQL type.
        if str(column).lower() in temporal_names or "time" in sql_type or "date" in sql_type:
            inferred[column] = "temporal"
            continue

        if sql_type in nominal_sql_types:
            inferred[column] = "nominal"
            continue

        if sql_type not in numeric_sql_types:
            # Unrecognised SQL types receive no entry.
            continue

        # Numeric column: low cardinality -> nominal, id-like -> id,
        # anything else -> quantitative.
        if tbl.cardinality[column] < 13:
            inferred[column] = "nominal"
        elif check_if_id_like(tbl, column):
            inferred[column] = "id"
        else:
            inferred[column] = "quantitative"

    tbl._data_type = inferred
def compute_data_type(self, ldf: LuxDataFrame):
    """
    Infer a Lux data type for every column of ``ldf``.

    Each column is classified as "temporal", "quantitative", "nominal" or "id"
    and the result is stored in ``ldf._data_type``. Entries in
    ``ldf._type_override`` take precedence over inference. A warning is issued
    for columns that were classified as temporal but are not stored with a
    pandas datetime dtype.

    Parameters
    ----------
    ldf : LuxDataFrame
        Dataframe whose ``_data_type`` mapping is updated in place.
        ``ldf.cardinality`` and ``ldf.pre_aggregated`` are read for numeric
        columns.

    Returns
    -------
    None
    """
    from pandas.api.types import is_datetime64_any_dtype as is_datetime

    # Column names that are treated as temporal regardless of their dtype.
    temporal_var_list = ["month", "year", "day", "date", "time", "weekday"]
    n_rows = len(ldf)

    for attr in list(ldf.columns):
        if attr in ldf._type_override:
            # An explicit user override wins over any inference.
            ldf._data_type[attr] = ldf._type_override[attr]
            continue

        if is_datetime(ldf[attr]):
            ldf._data_type[attr] = "temporal"
        elif self._is_datetime_string(ldf[attr]):
            ldf._data_type[attr] = "temporal"
        elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
            # The column label itself is a Timestamp.
            ldf._data_type[attr] = "temporal"
        elif str(attr).lower() in temporal_var_list:
            ldf._data_type[attr] = "temporal"
        elif self._is_datetime_number(ldf[attr]):
            ldf._data_type[attr] = "temporal"
        elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
            # int columns get coerced into floats if they contain NaN
            convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
            if (
                convertible2int
                and ldf.cardinality[attr] != n_rows
                and ldf.cardinality[attr] < 20
            ):
                ldf._data_type[attr] = "nominal"
            else:
                ldf._data_type[attr] = "quantitative"
        elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
            if ldf.pre_aggregated:
                if ldf.cardinality[attr] == n_rows:
                    # NOTE(review): this assignment is always superseded by the
                    # if/else below (cardinality == n_rows can never satisfy the
                    # ratio test). Kept unchanged pending confirmation of intent.
                    ldf._data_type[attr] = "nominal"
            # An integer column is nominal when fewer than 40% of its rows are
            # distinct and it has fewer than 20 distinct values. The ratio is
            # undefined for an empty frame, which falls back to quantitative
            # instead of raising ZeroDivisionError.
            if (
                n_rows > 0
                and ldf.cardinality[attr] / n_rows < 0.4
                and ldf.cardinality[attr] < 20
            ):
                ldf._data_type[attr] = "nominal"
            else:
                ldf._data_type[attr] = "quantitative"
            if check_if_id_like(ldf, attr):
                ldf._data_type[attr] = "id"
        elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
            # A single NaN value can cause a column's dtype to be object.
            if check_if_id_like(ldf, attr):
                ldf._data_type[attr] = "id"
            else:
                ldf._data_type[attr] = "nominal"
        elif is_datetime_series(ldf.dtypes[attr]):
            # check if attribute is any type of datetime dtype
            ldf._data_type[attr] = "temporal"
        else:
            ldf._data_type[attr] = "nominal"

    # A named, non-integer index is treated as a nominal attribute.
    if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name:
        ldf._data_type[ldf.index.name] = "nominal"

    # Columns that look temporal but are not stored as pandas datetimes.
    non_datetime_attrs = [
        attr
        for attr in ldf.columns
        if ldf._data_type[attr] == "temporal" and not is_datetime(ldf[attr])
    ]
    if not non_datetime_attrs:
        return

    if len(non_datetime_attrs) == 1:
        warn_parts = [
            f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
        ]
        override_subject = non_datetime_attrs[0]
    else:
        warn_parts = [
            f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
        ]
        override_subject = f"any of {non_datetime_attrs}"

    warn_parts.append(
        "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\nFor example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n\nHere is a starter template that you can use for converting the temporal fields:\n"
    )
    warn_parts.extend(
        f"\tdf['{name}'] = pd.to_datetime(df['{name}'], format='<replace-with-datetime-format>')\n"
        for name in non_datetime_attrs
    )
    warn_parts.append(
        "\nSee more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html"
    )
    warn_parts.append(
        f"\nIf {override_subject} is not a temporal attribute, please use override Lux's automatically detected type:"
    )
    # One override example per flagged column, so that every column is covered
    # rather than only the last one visited by the loop above.
    warn_parts.extend(
        f"\n\tdf.set_data_type({{'{name}':'quantitative'}})"
        for name in non_datetime_attrs
    )
    warnings.warn("".join(warn_parts), stacklevel=2)