Ejemplo n.º 1
0
 def validate_clause(clause):
     """Validate one clause against the columns and values of ``ldf``.

     ``ldf`` is not a parameter; it is resolved from the surrounding scope.
     Clauses whose attribute or value is the wildcard ``"?"`` are not
     checked, and neither is the special attribute ``"Record"``.

     Raises:
         ValueError: if a referenced attribute is not a column of ``ldf``, or
             if an ``=`` filter value does not occur in the attribute's column.
     """
     attribute = clause.attribute
     is_wildcard = (attribute and attribute == "?") or (
         clause.value and clause.value == "?")
     if is_wildcard:
         return

     if isinstance(attribute, list):
         # Every attribute in the list has to be an existing column.
         for candidate in attribute:
             if candidate not in list(ldf.columns):
                 raise ValueError(
                     f"The input attribute {candidate} does not exist in the DataFrame."
                 )
         return

     if attribute == "Record":
         return

     # Attributes accepted by is_datetime_string() are exempt from the
     # column-existence check.
     if attribute and not is_datetime_string(attribute):
         if attribute not in list(ldf.columns):
             raise ValueError(
                 f"The input attribute {attribute} does not exist in the DataFrame."
             )

     # Only equality filters with both a value and an attribute are value-checked.
     if not (clause.value and attribute and clause.filter_op == "="):
         return

     column = ldf[attribute]
     # Datetime columns are not value-checked: a filter value does not have to
     # match the exact Timestamp representation stored in the column.
     if is_datetime_series(column):
         return

     requested = clause.value if isinstance(clause.value,
                                            list) else [clause.value]
     for item in requested:
         if item not in column.values:
             raise ValueError(
                 f"The input value {item} does not exist for the attribute {attribute} for the DataFrame."
             )
Ejemplo n.º 2
0
    def compute_data_type(self, ldf: LuxDataFrame):
        """Classify every column of ``ldf`` and warn about non-datetime temporal columns.

        Fills ``ldf.data_type_lookup`` with one of "temporal", "quantitative",
        "nominal" or "id" for each column, stores
        ``self.mapping(ldf.data_type_lookup)`` in ``ldf.data_type``, and emits a
        warning naming the columns that were classified as temporal but do not
        have a datetime dtype.

        Args:
            ldf: frame to inspect; its ``data_type_lookup`` and ``data_type``
                attributes are updated in place.
        """
        from pandas.api.types import is_datetime64_any_dtype as is_datetime

        # Column names that are treated as temporal on an exact,
        # case-insensitive match.
        temporal_var_list = ["month", "year", "day", "date", "time"]

        for attr in list(ldf.columns):
            if isinstance(attr, pd.Timestamp):
                # The column label itself is a Timestamp.
                ldf.data_type_lookup[attr] = "temporal"
            elif str(attr).lower() in temporal_var_list:
                ldf.data_type_lookup[attr] = "temporal"
            elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
                # Any float width counts as quantitative, not only float64.
                ldf.data_type_lookup[attr] = "quantitative"
            elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
                # NOTE(review): the assignment inside this `if` is always
                # overwritten by the if/elif/else right below, so it currently
                # has no effect. Kept as-is; confirm whether pre-aggregated
                # frames were meant to short-circuit to "nominal".
                if ldf.pre_aggregated:
                    if ldf.cardinality[attr] == len(ldf):
                        ldf.data_type_lookup[attr] = "nominal"
                # An integer column is nominal when the ratio of cardinality
                # to data size is below 0.4 and it has fewer than 10 unique
                # values.
                if (ldf.cardinality[attr] / len(ldf) < 0.4
                        and ldf.cardinality[attr] < 10):
                    ldf.data_type_lookup[attr] = "nominal"
                elif check_if_id_like(ldf, attr):
                    ldf.data_type_lookup[attr] = "id"
                else:
                    ldf.data_type_lookup[attr] = "quantitative"
            # NOTE: the original author flagged this clause for removal because
            # a single NaN value can cause the dtype to be object.
            elif ldf.dtypes[attr] == "object":
                ldf.data_type_lookup[attr] = "nominal"
            # Check if the attribute is any type of datetime dtype.
            elif is_datetime_series(ldf.dtypes[attr]):
                ldf.data_type_lookup[attr] = "temporal"
            else:
                # Fallback for dtypes no branch above recognises (e.g. bool or
                # category). Without it the column would have no entry and the
                # lookup in the temporal check below could raise KeyError.
                ldf.data_type_lookup[attr] = "nominal"

        if ldf.index.dtype != 'int64' and ldf.index.name:
            ldf.data_type_lookup[ldf.index.name] = "nominal"
        ldf.data_type = self.mapping(ldf.data_type_lookup)

        # Columns classified as temporal that do not have a datetime dtype.
        non_datetime_attrs = [
            attr for attr in ldf.columns
            if ldf.data_type_lookup[attr] == 'temporal'
            and not is_datetime(ldf[attr])
        ]
        if non_datetime_attrs:
            # Singular and plural warnings differ only in these two fragments.
            if len(non_datetime_attrs) == 1:
                subject = f"the attribute '{non_datetime_attrs[0]}'"
                target = "this attribute"
            else:
                subject = f"attributes {non_datetime_attrs}"
                target = "these attributes"
            warnings.warn(
                f"\nLux detects that {subject} may be temporal.\n"
                f"In order to display visualizations for {target} accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
                f"Please consider converting {target} using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
                "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
                "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
                stacklevel=2)
Ejemplo n.º 3
0
 def validate_clause(clause):
     """Check one intent clause against ``ldf`` and describe every problem found.

     ``ldf`` is not a parameter; it is resolved from the surrounding scope.
     Wildcard (``"?"``) clauses, clauses with an empty attribute and the
     special ``"Record"`` attribute are not checked.

     All problems are reported, in the order they are detected, instead of
     only the last one. A clause whose attribute cannot be looked up in
     ``ldf`` yields a message rather than a ``KeyError``.

     Returns:
         str: the concatenation of all issues found, each one a bullet that
         starts on a new line with ``- ``; the empty string when the clause
         is fine.
     """
     attribute = clause.attribute
     if attribute == "?" or clause.value == "?" or attribute == "":
         return ""

     columns = list(ldf.columns)
     issues = []

     if isinstance(attribute, list):
         # Report each listed attribute that is not a column.
         issues.extend(
             f"\n- The input attribute '{name}' does not exist in the DataFrame."
             for name in attribute if name not in columns)
         return "".join(issues)

     if attribute == "Record":
         return ""

     # Only string attributes that is_datetime_string() rejects are checked
     # for existence here.
     if (isinstance(attribute, str) and not is_datetime_string(attribute)
             and attribute not in columns):
         # The user may have typed a value where an attribute was expected;
         # look for it among the known unique values (the last match wins).
         owner = False
         for name, known_values in ldf.unique_values.items():
             if attribute in known_values:
                 owner = name
         if owner:
             issues.append(
                 f"\n- The input '{attribute}' looks like a value that belongs to the '{owner}' attribute. \n  Please specify the value fully, as something like {owner}={attribute}."
             )
         else:
             issues.append(
                 f"\n- The input attribute '{attribute}' does not exist in the DataFrame. \n  Please check your input intent for typos."
             )

     # Value check for equality filters; NaN-like filter values are skipped.
     if (clause.value != "" and clause.filter_op == "="
             and not lux.utils.utils.like_nan(clause.value)):
         try:
             series = ldf[attribute]
         except KeyError:
             # The column cannot be looked up, so there is nothing to compare
             # the value against. Report the missing attribute unless that
             # was already done above.
             if not issues:
                 issues.append(
                     f"\n- The input attribute '{attribute}' does not exist in the DataFrame."
                 )
             return "".join(issues)

         # Datetime columns are not value-checked: a filter value does not
         # have to match the exact Timestamp representation in the column.
         if not is_datetime_series(series):
             if isinstance(clause.value, list):
                 vals = clause.value
             else:
                 vals = [clause.value]
             # Membership is only checked when the Pandas executor is active.
             if lux.config.executor.name == "PandasExecutor":
                 for val in vals:
                     if val not in series.values:
                         issues.append(
                             f"\n- The input value '{val}' does not exist for the attribute '{attribute}' for the DataFrame."
                         )
     return "".join(issues)
Ejemplo n.º 4
0
    def compute_data_type(self, ldf: LuxDataFrame):
        """Classify every column of ``ldf`` and warn about non-datetime temporal columns.

        Fills ``ldf._data_type`` with one of "temporal", "quantitative",
        "nominal" or "id" for each column. Entries in ``ldf._type_override``
        take precedence over detection. Afterwards a single warning is emitted
        that names every column classified as temporal without a datetime
        dtype, with a conversion template and an override hint per column.

        Args:
            ldf: frame to inspect; its ``_data_type`` mapping is updated in
                place.
        """
        from pandas.api.types import is_datetime64_any_dtype as is_datetime

        # Column names that are treated as temporal on an exact,
        # case-insensitive match.
        temporal_var_list = ["month", "year", "day", "date", "time", "weekday"]

        for attr in list(ldf.columns):
            if attr in ldf._type_override:
                # A user-specified type wins over automatic detection.
                ldf._data_type[attr] = ldf._type_override[attr]
                continue

            if is_datetime(ldf[attr]):
                ldf._data_type[attr] = "temporal"
            elif self._is_datetime_string(ldf[attr]):
                ldf._data_type[attr] = "temporal"
            elif isinstance(attr, pd.Timestamp):
                # The column label itself is a Timestamp.
                ldf._data_type[attr] = "temporal"
            elif str(attr).lower() in temporal_var_list:
                ldf._data_type[attr] = "temporal"
            elif self._is_datetime_number(ldf[attr]):
                ldf._data_type[attr] = "temporal"
            elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
                # Integer columns get coerced into floats if they contain NaN,
                # so check whether the values are really integers.
                convertible2int = pd.api.types.is_integer_dtype(
                    ldf[attr].convert_dtypes())
                if (convertible2int and ldf.cardinality[attr] != len(ldf)
                        and ldf.cardinality[attr] < 20):
                    ldf._data_type[attr] = "nominal"
                else:
                    ldf._data_type[attr] = "quantitative"
            elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
                # NOTE(review): the assignment inside this `if` is always
                # overwritten by the if/else right below, so it currently has
                # no effect. Kept as-is; confirm whether pre-aggregated frames
                # were meant to short-circuit to "nominal".
                if ldf.pre_aggregated:
                    if ldf.cardinality[attr] == len(ldf):
                        ldf._data_type[attr] = "nominal"
                # An integer column is nominal when the ratio of cardinality
                # to data size is below 0.4 and it has fewer than 20 unique
                # values.
                if (ldf.cardinality[attr] / len(ldf) < 0.4
                        and ldf.cardinality[attr] < 20):
                    ldf._data_type[attr] = "nominal"
                else:
                    ldf._data_type[attr] = "quantitative"
                # An id-like column overrides either result.
                if check_if_id_like(ldf, attr):
                    ldf._data_type[attr] = "id"
            # NOTE: the original author flagged this clause for removal because
            # a single NaN value can cause the dtype to be object.
            elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
                if check_if_id_like(ldf, attr):
                    ldf._data_type[attr] = "id"
                else:
                    ldf._data_type[attr] = "nominal"
            # Check if the attribute is any type of datetime dtype.
            elif is_datetime_series(ldf.dtypes[attr]):
                ldf._data_type[attr] = "temporal"
            else:
                ldf._data_type[attr] = "nominal"

        if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name:
            ldf._data_type[ldf.index.name] = "nominal"

        # Columns classified as temporal that do not have a datetime dtype.
        non_datetime_attrs = [
            attr for attr in ldf.columns
            if ldf._data_type[attr] == "temporal"
            and not is_datetime(ldf[attr])
        ]
        if non_datetime_attrs:
            if len(non_datetime_attrs) == 1:
                parts = [
                    f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
                ]
            else:
                parts = [
                    f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
                ]
            parts.append(
                "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\nFor example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n\nHere is a starter template that you can use for converting the temporal fields:\n"
            )
            parts.extend(
                f"\tdf['{attr}'] = pd.to_datetime(df['{attr}'], format='<replace-with-datetime-format>')\n"
                for attr in non_datetime_attrs)
            parts.append(
                "\nSee more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html"
            )
            # Emit the override hint once per flagged attribute, so that every
            # attribute is named and not just the last one in the list.
            for attr in non_datetime_attrs:
                parts.append(
                    f"\nIf {attr} is not a temporal attribute, please use override Lux's automatically detected type:"
                )
                parts.append(
                    f"\n\tdf.set_data_type({{'{attr}':'quantitative'}})")
            warnings.warn("".join(parts), stacklevel=2)