Esempio n. 1
0
    def compute_data_type(self, ldf: LuxDataFrame):
        """
        Infer a Lux data type for every column of ``ldf``.

        Each column is labelled "temporal", "quantitative", "nominal" or "id"
        in ``ldf.data_type_lookup``; ``ldf.data_type`` is then rebuilt from
        that lookup via ``self.mapping``. A warning is issued for columns that
        were labelled temporal but are not stored with a datetime dtype.

        Parameters
        ----------
        ldf : LuxDataFrame
            Frame whose ``data_type_lookup`` and ``data_type`` are updated in
            place. Its ``cardinality`` and ``pre_aggregated`` attributes are
            read while classifying integer columns.

        Returns
        -------
        None
        """
        temporal_var_list = ["month", "year", "day", "date", "time"]
        n_rows = len(ldf)
        for attr in list(ldf.columns):
            if isinstance(attr, pd.Timestamp):
                # The column label itself is a timestamp.
                ldf.data_type_lookup[attr] = "temporal"
            elif str(attr).lower() in temporal_var_list:
                # Exact (case-insensitive) match on a well-known temporal name.
                ldf.data_type_lookup[attr] = "temporal"
            elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
                ldf.data_type_lookup[attr] = "quantitative"
            elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
                cardinality = ldf.cardinality[attr]
                if ldf.pre_aggregated:
                    if cardinality == n_rows:
                        # NOTE(review): this label never survives. When the
                        # cardinality equals the row count the ratio test
                        # below fails and "quantitative" is assigned instead.
                        # Kept unchanged to preserve behavior; confirm intent.
                        ldf.data_type_lookup[attr] = "nominal"
                # Integers with few distinct values relative to the row count
                # are treated as categories. The row-count guard keeps an
                # empty frame from raising ZeroDivisionError.
                if n_rows > 0 and cardinality / n_rows < 0.4 and cardinality < 30:
                    ldf.data_type_lookup[attr] = "nominal"
                else:
                    ldf.data_type_lookup[attr] = "quantitative"
                # An id-like column overrides either label.
                if check_if_id_like(ldf, attr):
                    ldf.data_type_lookup[attr] = "id"
            elif pd.api.types.is_object_dtype(ldf.dtypes[attr]):
                if check_if_id_like(ldf, attr):
                    ldf.data_type_lookup[attr] = "id"
                else:
                    ldf.data_type_lookup[attr] = "nominal"
            elif is_datetime_series(ldf.dtypes[attr]):
                # Any remaining datetime-like dtype.
                ldf.data_type_lookup[attr] = "temporal"
            else:
                # Dtypes matched by none of the checks above (for instance
                # boolean columns) previously got no entry at all, which made
                # the temporal-warning scan below fail with KeyError.
                ldf.data_type_lookup[attr] = "nominal"
        if (ldf.index.dtype != 'int64' and ldf.index.name):
            # A named, non-int64 index is treated as a nominal attribute.
            ldf.data_type_lookup[ldf.index.name] = "nominal"
        ldf.data_type = self.mapping(ldf.data_type_lookup)

        # Columns labelled temporal that are not actually datetime typed.
        non_datetime_attrs = [
            attr
            for attr in ldf.columns
            if ldf.data_type_lookup[attr] == 'temporal'
            and not pd.api.types.is_datetime64_any_dtype(ldf[attr])
        ]
        if non_datetime_attrs:
            if len(non_datetime_attrs) == 1:
                subject = f"the attribute '{non_datetime_attrs[0]}'"
                target = "this attribute"
            else:
                subject = f"attributes {non_datetime_attrs}"
                target = "these attributes"
            # warnings.warn stays in this frame so stacklevel=2 keeps pointing
            # at the caller of compute_data_type.
            warnings.warn(
                f"\nLux detects that {subject} may be temporal.\n"
                f"In order to display visualizations for {target} accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
                f"Please consider converting {target} using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
                "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
                "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
                stacklevel=2,
            )
Esempio n. 2
0
    def compute_data_type(self, lst: LuxSQLTable):
        """
        Infer the Lux data type of each column of the SQL table behind ``lst``.

        The SQL type of every column is read from INFORMATION_SCHEMA.COLUMNS
        and mapped to "temporal", "nominal", "id" or "quantitative". The
        resulting dictionary replaces ``lst._data_type``; a column whose SQL
        type matches none of the known groups gets no entry.

        Parameters
        ----------
        lst: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        data_type = {}
        sql_dtypes = {}
        self.get_cardinality(lst)
        # Drop everything up to and including the first "." (e.g. a schema
        # prefix) before matching against TABLE_NAME.
        if "." in lst.table_name:
            table_name = lst.table_name[lst.table_name.index(".") + 1 :]
        else:
            table_name = lst.table_name
        # get the data types of the attributes in the SQL table
        # NOTE(review): table and column names are interpolated directly into
        # the SQL text; this is only safe while they come from trusted
        # database metadata.
        for attr in list(lst.columns):
            datatype_query = "SELECT DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{}' AND COLUMN_NAME = '{}'".format(
                table_name, attr
            )
            datatype = list(pandas.read_sql(datatype_query, lux.config.SQLconnection)["data_type"])[0]

            sql_dtypes[attr] = datatype
        for attr in list(lst.columns):
            if str(attr).lower() in ["month", "year"]:
                # Column name alone marks it as temporal.
                data_type[attr] = "temporal"
            elif sql_dtypes[attr] in [
                "character",
                "character varying",
                "boolean",
                "uuid",
                "text",
            ]:
                data_type[attr] = "nominal"
            elif sql_dtypes[attr] in [
                "integer",
                "numeric",
                "decimal",
                "bigint",
                "real",
                "smallint",
                "smallserial",
                "serial",
                "double precision",
            ]:
                # Numeric columns with very few distinct values are treated
                # as categories.
                if lst.cardinality[attr] < 13:
                    data_type[attr] = "nominal"
                elif check_if_id_like(lst, attr):
                    # Record the label in the local result; writing it to
                    # lst._data_type here would be discarded by the
                    # assignment at the end of this method.
                    data_type[attr] = "id"
                else:
                    data_type[attr] = "quantitative"
            elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]:
                data_type[attr] = "temporal"
        lst._data_type = data_type
Esempio n. 3
0
    def compute_data_type(self, tbl: LuxSQLTable):
        """
        Derive the Lux data type of every column of the SQL table behind ``tbl``.

        For each column the SQL type is fetched with the configured
        ``datatype_query`` template and translated to "temporal", "nominal",
        "id" or "quantitative". The collected mapping replaces
        ``tbl._data_type``; columns whose SQL type belongs to none of the
        known groups are left out of it.

        Parameters
        ----------
        tbl: lux.LuxSQLTable
            lux.LuxSQLTable object whose metadata will be calculated

        Returns
        -------
        None
        """
        temporal_names = {"month", "year"}
        nominal_sql_types = {
            "character",
            "character varying",
            "boolean",
            "uuid",
            "text",
        }
        numeric_sql_types = {
            "integer",
            "numeric",
            "decimal",
            "bigint",
            "real",
            "smallint",
            "smallserial",
            "serial",
            "double",
            "double precision",
        }

        inferred = {}
        self.get_cardinality(tbl)

        # Keep only the part after the first "." when the name is qualified.
        _, dot, unqualified = tbl.table_name.partition(".")
        table_name = unqualified if dot else tbl.table_name

        for column in list(tbl.columns):
            # Ask the database for this column's SQL type.
            query = lux.config.query_templates['datatype_query'].format(
                table_name=table_name, attribute=column
            )
            result = pandas.read_sql(query, lux.config.SQLconnection)
            sql_type = list(result["data_type"])[0]

            # Temporal: either by column name or by a time/date SQL type.
            if str(column).lower() in temporal_names or "time" in sql_type or "date" in sql_type:
                inferred[column] = "temporal"
                continue

            if sql_type in nominal_sql_types:
                inferred[column] = "nominal"
                continue

            if sql_type not in numeric_sql_types:
                # Unrecognised SQL type: no entry is recorded.
                continue

            # Numeric column: few distinct values -> nominal, id-like -> id,
            # anything else -> quantitative.
            if tbl.cardinality[column] < 13:
                inferred[column] = "nominal"
            elif check_if_id_like(tbl, column):
                inferred[column] = "id"
            else:
                inferred[column] = "quantitative"

        tbl._data_type = inferred
Esempio n. 4
0
    def compute_data_type(self, ldf: LuxDataFrame):
        """
        Infer a Lux data type for every column of ``ldf``.

        Each column is labelled "temporal", "quantitative", "nominal" or "id"
        in ``ldf._data_type``. Entries in ``ldf._type_override`` take
        precedence over inference. A warning with a conversion template is
        issued for columns labelled temporal that are not stored with a
        datetime dtype.

        Parameters
        ----------
        ldf : LuxDataFrame
            Frame whose ``_data_type`` mapping is updated in place. Its
            ``_type_override``, ``cardinality`` and ``pre_aggregated``
            attributes are read.

        Returns
        -------
        None
        """
        from pandas.api.types import is_datetime64_any_dtype as is_datetime

        temporal_var_list = ["month", "year", "day", "date", "time", "weekday"]
        n_rows = len(ldf)

        for attr in list(ldf.columns):
            if attr in ldf._type_override:
                # A user-supplied type always wins over inference.
                ldf._data_type[attr] = ldf._type_override[attr]
                continue

            # Temporal checks run first, in this order: datetime dtype,
            # datetime-like strings, timestamp column label, well-known
            # temporal column name, datetime-like numbers.
            if (
                is_datetime(ldf[attr])
                or self._is_datetime_string(ldf[attr])
                or isinstance(attr, pd.Timestamp)
                or str(attr).lower() in temporal_var_list
                or self._is_datetime_number(ldf[attr])
            ):
                ldf._data_type[attr] = "temporal"
            elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
                # int columns gets coerced into floats if contain NaN
                convertible2int = pd.api.types.is_integer_dtype(
                    ldf[attr].convert_dtypes())
                if (convertible2int and ldf.cardinality[attr] != n_rows
                        and ldf.cardinality[attr] < 20):
                    ldf._data_type[attr] = "nominal"
                else:
                    ldf._data_type[attr] = "quantitative"
            elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
                cardinality = ldf.cardinality[attr]
                if ldf.pre_aggregated:
                    if cardinality == n_rows:
                        # NOTE(review): this label never survives. When the
                        # cardinality equals the row count the ratio test
                        # below fails and "quantitative" is assigned instead.
                        # Kept unchanged to preserve behavior; confirm intent.
                        ldf._data_type[attr] = "nominal"
                # Integers with few distinct values relative to the row count
                # are treated as categories. The row-count guard keeps an
                # empty frame from raising ZeroDivisionError.
                if n_rows > 0 and cardinality / n_rows < 0.4 and cardinality < 20:
                    ldf._data_type[attr] = "nominal"
                else:
                    ldf._data_type[attr] = "quantitative"
                # An id-like column overrides either label.
                if check_if_id_like(ldf, attr):
                    ldf._data_type[attr] = "id"
            elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
                if check_if_id_like(ldf, attr):
                    ldf._data_type[attr] = "id"
                else:
                    ldf._data_type[attr] = "nominal"
            # check if attribute is any type of datetime dtype
            elif is_datetime_series(ldf.dtypes[attr]):
                ldf._data_type[attr] = "temporal"
            else:
                # Anything unrecognised defaults to nominal.
                ldf._data_type[attr] = "nominal"

        if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name:
            # A named, non-integer index is treated as a nominal attribute.
            ldf._data_type[ldf.index.name] = "nominal"

        # Columns labelled temporal that are not actually datetime typed.
        non_datetime_attrs = [
            attr
            for attr in ldf.columns
            if ldf._data_type[attr] == "temporal" and not is_datetime(ldf[attr])
        ]
        if not non_datetime_attrs:
            return

        if len(non_datetime_attrs) == 1:
            headline = f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
        else:
            headline = f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
        # The override hint names only the last flagged attribute; this was
        # previously implicit through the leaked loop variable.
        last_attr = non_datetime_attrs[-1]
        parts = [
            headline,
            "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\nFor example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n\nHere is a starter template that you can use for converting the temporal fields:\n",
        ]
        parts.extend(
            f"\tdf['{attr}'] = pd.to_datetime(df['{attr}'], format='<replace-with-datetime-format>')\n"
            for attr in non_datetime_attrs
        )
        parts.append("\nSee more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html")
        parts.append(f"\nIf {last_attr} is not a temporal attribute, please use override Lux's automatically detected type:")
        parts.append(f"\n\tdf.set_data_type({{'{last_attr}':'quantitative'}})")
        # warnings.warn stays in this frame so stacklevel=2 keeps pointing at
        # the caller of compute_data_type.
        warnings.warn("".join(parts), stacklevel=2)