Example #1
def string_indexer(df: pd.DataFrame, dom_expr: Expr, new_column_name: str):
    de: Any = dom_expr._expr
    column_name = de.args[0].attr
    if new_column_name is None:
        new_column_name = column_name

    if _is_pandas_df(df):
        sorted_indices = df[column_name].value_counts().index
        df[new_column_name] = df[column_name].map(
            dict(zip(sorted_indices, range(0, len(sorted_indices)))))
        if new_column_name != column_name:
            del df[column_name]
    elif spark_installed and _is_spark_df(df):
        df = df.withColumnRenamed(
            column_name, "newColName"
        )  # renaming because inputCol and outputCol can't be the same.
        indexer = StringIndexer(inputCol="newColName",
                                outputCol=new_column_name)
        df = indexer.fit(df).transform(df)
        df = df.drop("newColName")
    else:
        raise ValueError(
            "function day_of_month supports only Pandas dataframes or spark dataframes."
        )

    return new_column_name, df
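A minimal pandas-path usage sketch, assuming a `string_indexer` expression builder in `lale.expressions` (the exact import is an assumption):

import pandas as pd
from lale.expressions import it
from lale.expressions import string_indexer as si_expr  # assumed builder

df = pd.DataFrame({"state": ["NY", "CA", "NY", "TX", "NY", "CA"]})
new_name, df = string_indexer(df, si_expr(it.state), "state_idx")
# Values are indexed by descending frequency: "NY" -> 0, "CA" -> 1, "TX" -> 2.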
Example #2
    def transform(self, X):
        table_name = lale.datasets.data_schemas.get_table_name(X)
        columns_to_keep = []

        def get_map_function_output(column, new_column_name):
            functions_module = importlib.import_module("lale.lib.lale.functions")
            if _is_ast_subscript(column._expr) or _is_ast_attribute(column._expr):
                function_name = "identity"
            else:
                function_name = column._expr.func.id
            map_func_to_be_called = getattr(functions_module, function_name)
            return map_func_to_be_called(X, column, new_column_name)

        if isinstance(self.columns, list):
            for column in self.columns:
                new_column_name, X = get_map_function_output(column, None)
                columns_to_keep.append(new_column_name)
        elif isinstance(self.columns, dict):
            for new_column_name, column in self.columns.items():
                new_column_name, X = get_map_function_output(column, new_column_name)
                columns_to_keep.append(new_column_name)
        else:
            raise ValueError("columns must be either a list or a dictionary.")
        mapped_df = X  # Do nothing as X already has the right columns
        if self.remainder == "drop":
            if _is_pandas_df(X):
                mapped_df = X[columns_to_keep]
            elif _is_spark_df(X):
                mapped_df = X.select(columns_to_keep)
            else:
                raise ValueError(
                    "Only Pandas or Spark dataframe are supported as inputs. Please check that pyspark is installed if you see this error for a Spark dataframe."
                )
        mapped_df = lale.datasets.data_schemas.add_table_name(mapped_df, table_name)
        return mapped_df
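This `transform` backs a `Map`-style operator; a hedged sketch of driving it, assuming the operator lives in `lale.lib.lale`, that `remainder` defaults to "drop", and that `transform` can be called directly:

import pandas as pd
from lale.expressions import it, replace
from lale.lib.lale import Map  # assumed operator location

df = pd.DataFrame({"gender": ["m", "f"], "age": [33, 41]})
mapped = Map(columns={"gender_n": replace(it.gender, {"m": 0, "f": 1})}).transform(df)
# Only the mapped column "gender_n" is kept when remainder == "drop".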
Example #3
def identity(df: Any, column: Expr, new_column_name: str):
    if _is_ast_subscript(column._expr):  # type: ignore
        subscript = column._expr.slice  # type: ignore
        if isinstance(subscript, ast.Index):  # Python < 3.9 wraps the index node
            subscript = subscript.value
        column_name = ast.literal_eval(subscript)  # type: ignore
    elif _is_ast_attribute(column._expr):  # type: ignore
        column_name = column._expr.attr  # type: ignore
    else:
        raise ValueError(
            "Expression type not supported. Formats supported: it.column_name or it['column_name']."
        )

    if column_name is None or not column_name.strip():
        raise ValueError(
            "Name of the column to be renamed cannot be None or empty.")
    if new_column_name is None or not new_column_name.strip():
        raise ValueError(
            "New name of the column to be renamed cannot be None or empty.")

    if _is_pandas_df(df):
        df = df.rename(columns={column_name: new_column_name})
    elif spark_installed and _is_spark_df(df):
        df = df.withColumnRenamed(column_name, new_column_name)
    else:
        raise ValueError(
            "Function identity supports only Pandas dataframes or spark dataframes."
        )
    return new_column_name, df
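Since `identity` only renames a column, a minimal pandas sketch:

import pandas as pd
from lale.expressions import it

df = pd.DataFrame({"age": [33, 41]})
new_name, df = identity(df, it["age"], "years")
# df now has the single column "years"; the values are untouched.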
Example #4
def time_functions(df: Any, dom_expr: Expr, new_column_name: str,
                   pandas_func: str, spark_func: str):
    fmt = None
    de: Any = dom_expr._expr
    column_name = de.args[0].attr
    if new_column_name is None:
        new_column_name = column_name
    if len(de.args) > 1:
        fmt = ast.literal_eval(de.args[1])
    if _is_pandas_df(df):
        new_column = pd.to_datetime(df[column_name], format=fmt)
        df[new_column_name] = getattr(new_column.dt, pandas_func)
        if new_column_name != column_name:
            del df[column_name]
    elif spark_installed and _is_spark_df(df):
        df = df.withColumn(column_name, to_timestamp(df[column_name],
                                                     fmt))  # type: ignore
        df = df.select(
            eval(spark_func + "(df[column_name])").alias(new_column_name))
        if new_column_name != column_name:
            df = df.drop(column_name)
    else:
        raise ValueError(
            "function day_of_month supports only Pandas dataframes or spark dataframes."
        )

    return new_column_name, df
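A pandas-path sketch, assuming a `day_of_month` builder in `lale.expressions` and the accessor names `day` (pandas) and `dayofmonth` (Spark):

import pandas as pd
from lale.expressions import it, day_of_month  # assumed builder

df = pd.DataFrame({"ts": ["2021-06-15", "2021-07-01"]})
new_name, df = time_functions(df, day_of_month(it.ts), "dom", "day", "dayofmonth")
# pandas path: pd.to_datetime(df["ts"]).dt.day -> [15, 1]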
Example #5
def count(df):
    if isinstance(df, np.ndarray):
        return df.size  # note: total element count, not the row count, for 2-d arrays
    if _is_pandas_df(df) or _is_pandas_series(df):
        return len(df)
    elif _is_spark_df(df):
        return df.count()
    else:
        return len(df)  # fall back to len() for other sequence-like inputs
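Note the asymmetry for NumPy inputs, which count every element rather than rows:

import numpy as np
import pandas as pd

arr = np.ones((3, 2))
count(arr)                # 6: ndarray.size counts all elements
count(pd.DataFrame(arr))  # 3: number of rows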
Example #6
File: map.py Project: hirzel/lale
 def transform(self, X):
     if _is_pandas_df(X):
         return self.transform_pandas_df(X)
     elif _is_spark_df(X):
         return self.transform_spark_df(X)
     else:
         raise ValueError(
             f"Only Pandas or Spark dataframe are supported as inputs, got {type(X)}. Please check that pyspark is installed if you see this error for a Spark dataframe."
         )
Example #7
def select_col(df, col: column_index):
    if isinstance(df, np.ndarray):
        # for a 2-d array, df[col] would select a row, so index the column axis
        return df[:, col] if df.ndim == 2 else df[col]
    elif _is_pandas_df(df):
        return df[col]
    elif _is_spark_df(df):
        return df.select(col)
    else:
        raise ValueError(f"Unsupported series type {type(df)}")
Example #8
def filter_isnotnull(df: Any, column_name: str):
    if _is_pandas_df(df):
        return df[df[column_name].notnull()]
    elif spark_installed and _is_spark_df(df):
        return df.filter(~pyspark.sql.functions.isnull(df[column_name]))
    else:
        raise ValueError(
            "the filter isnotnan supports only Pandas dataframes or spark dataframes."
        )
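A quick pandas check:

import pandas as pd

df = pd.DataFrame({"a": [1.0, None, 3.0]})
filter_isnotnull(df, "a")  # keeps the rows at positions 0 and 2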
Example #9
def get_obj_cols(df):
    """
    Returns names of 'object' columns in the DataFrame.
    """
    obj_cols = []
    if _is_pandas_df(df):
        for idx, dt in enumerate(df.dtypes):
            if dt == "object" or is_category(dt):
                obj_cols.append(df.columns.values[idx])
    elif _is_spark_df(df):
        assert False, "Not yet implemented"

    return obj_cols
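A pandas sketch, assuming the `is_category` helper matches pandas categorical dtypes:

import pandas as pd

df = pd.DataFrame({"a": ["x"], "b": [1], "c": pd.Categorical(["y"])})
get_obj_cols(df)  # -> ['a', 'c']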
Example #10
def _is_string_df(X):
    if _is_pandas_df(X):
        return X.shape[1] == X.select_dtypes(include="object").shape[1]
    elif _is_spark_df(X):
        from pyspark.sql.types import StringType

        string_cols = [
            f.name for f in X.schema.fields
            if isinstance(f.dataType, StringType)
        ]
        return len(get_columns(X)) == len(string_cols)
    else:
        return False
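For the pandas path:

import pandas as pd

_is_string_df(pd.DataFrame({"a": ["x"], "b": ["y"]}))  # True
_is_string_df(pd.DataFrame({"a": ["x"], "b": [1]}))    # False: "b" is numeric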
Example #11
 def split_df(self, X):
     if self.label_name not in X.columns:
         return X, None
     if _is_pandas_df(X):
         y = pd.DataFrame(X[self.label_name])
         X = X.drop(self.label_name, axis=1)
     elif _is_spark_df(X):
         y = X.select(X[self.label_name])
         X = X.drop(self.label_name)
     else:
         raise ValueError(
             "Only Pandas or Spark dataframe are supported as inputs. Please check that pyspark is installed if you see this error for a Spark dataframe."
         )
     return X, y
Example #12
def get_columns(df) -> List[column_index]:
    if _is_pandas_series(df):
        return pd.Series([df.name])
    if _is_pandas_df(df):
        return df.columns
    if _is_spark_with_index(df):
        return pd.Series(df.columns_without_index)
    if _is_spark_df(df):
        return df.columns
    if isinstance(df, np.ndarray):
        # should have more asserts here
        _, num_cols = df.shape
        return list(range(num_cols))
    assert False, type(df)
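The return type varies with the input, which callers must tolerate:

import numpy as np
import pandas as pd

get_columns(pd.DataFrame({"a": [1]}))  # Index(['a'], dtype='object')
get_columns(pd.Series([1], name="a"))  # pd.Series(['a'])
get_columns(np.zeros((2, 3)))          # [0, 1, 2]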
Example #13
def _is_numeric_df(X):
    if _is_pandas_df(X):
        return X.shape[1] == X.select_dtypes(include=np.number).shape[1]
    elif _is_spark_df(X):
        from pyspark.sql.types import NumericType

        numeric_cols = [
            f.name for f in X.schema.fields
            if isinstance(f.dataType, NumericType)
        ]
        if _is_spark_with_index(X) and get_index_name(X) in numeric_cols:
            numeric_cols.remove(get_index_name(X))
        return len(get_columns(X)) == len(numeric_cols)
    else:
        return False
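For the pandas path:

import pandas as pd

_is_numeric_df(pd.DataFrame({"a": [1], "b": [2.5]}))  # True
_is_numeric_df(pd.DataFrame({"a": [1], "b": ["x"]}))  # False: "b" is object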
Example #14
    def __call__(self, X):
        def is_categorical(column_values):
            unique_values = set()
            for val in column_values:
                if val not in unique_values:
                    unique_values.add(val)
                    if len(unique_values) > self._max_values:
                        return False
            return True

        if _is_pandas_df(X):
            result = [c for c in X.columns if is_categorical(X[c])]
        elif isinstance(X, np.ndarray):
            result = [c for c in range(X.shape[1]) if is_categorical(X[:, c])]
        else:
            raise TypeError(f"unexpected type {type(X)}")
        return result
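A hedged sketch of the enclosing callable; the `categorical` constructor name and its `max_values` parameter are assumptions inferred from `self._max_values`:

import pandas as pd

df = pd.DataFrame({"color": ["r", "g", "b", "r"], "id": [1, 2, 3, 4]})
picker = categorical(max_values=3)  # hypothetical constructor
picker(df)  # -> ["color"]; "id" has 4 distinct values, which exceeds 3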
Example #15
 def _set_fit_attributes(self, lifted):
     # set attribute values
     self.feature_names_in_ = lifted[0]
     self.n_features_in_ = len(self.feature_names_in_)
     self._lifted_statistics = lifted[1]
     strategy = self._hyperparams["strategy"]
     if strategy == "constant":
         self.statistics_ = self._lifted_statistics.to_numpy()[0]
     elif strategy == "mean":
         self.statistics_ = (self._lifted_statistics["sum"] /
                             self._lifted_statistics["count"]).to_numpy()[0]
     else:
         agg_data = self._lifted_statistics
         if agg_data is not None and _is_spark_df(agg_data):
             agg_data = agg_data.toPandas()
         if agg_data is not None and _is_pandas_df(agg_data):
              self.statistics_ = agg_data.to_numpy()[0]  # 2-d array to 1-d
     self._transformer = None
Example #16
    def __call__(self, X):
        def is_date_time(column_values):
            try:
                for val in column_values:
                    if isinstance(val, str):
                        datetime.datetime.strptime(val, self._fmt)
                    else:
                        return False
            except ValueError:
                return False
            return True

        if _is_pandas_df(X):
            result = [c for c in X.columns if is_date_time(X[c])]
        elif isinstance(X, np.ndarray):
            result = [c for c in range(X.shape[1]) if is_date_time(X[:, c])]
        else:
            raise TypeError(f"unexpected type {type(X)}")
        return result
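A hedged sketch; the `date_time` constructor name and its `fmt` parameter are assumptions inferred from `self._fmt`:

import numpy as np

finder = date_time(fmt="%Y-%m-%d")  # hypothetical constructor
X = np.array([["2021-01-31", "foo"], ["2020-12-01", "bar"]])
finder(X)  # -> [0]; only column 0 parses with the given format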
Example #17
    def transform(self, X):
        by = self.by
        orders: List[Tuple[str, bool]]
        if isinstance(by, list):
            orders = [self._get_order_key(k) for k in by]
        else:
            orders = [self._get_order_key(by)]

        cols: List[str] = [col for col, _ in orders]
        ascs: List[bool] = [asc for _, asc in orders]
        if _is_pandas_df(X):
            ordered_df = X.sort_values(by=cols, ascending=ascs)
        elif _is_spark_df(X):
            ordered_df = X.orderBy(cols, ascending=ascs)
        else:
            raise ValueError(
                "Only Pandas or Spark dataframe are supported as inputs. Please check that pyspark is installed if you see this error for a Spark dataframe."
            )

        ordered_df = forward_metadata(X, ordered_df)
        return ordered_df
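A hedged sketch of driving this transform, assuming the operator and the `asc`/`desc` builders live where shown:

import pandas as pd
from lale.expressions import asc, desc, it  # assumed builders
from lale.lib.lale import OrderBy  # assumed operator location

df = pd.DataFrame({"age": [41, 33, 41], "name": ["b", "a", "c"]})
ordered = OrderBy(by=[asc(it.age), desc(it.name)]).transform(df)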
Example #18
 def transform(self, X):
     group_by_keys = []
     for by_element in self.by if self.by is not None else []:
         expr_to_parse = by_element._expr
         group_by_keys.append(self._get_group_key(expr_to_parse))
     col_not_in_X = np.setdiff1d(group_by_keys, X.columns)
     if col_not_in_X.size > 0:
         raise ValueError(
             "GroupBy key columns {} not present in input dataframe X.".
             format(col_not_in_X))
     if _is_spark_df(X):
         grouped_df = X.groupby(group_by_keys)
     elif _is_pandas_df(X):
         grouped_df = X.groupby(group_by_keys, sort=False)
     else:
         raise ValueError(
             "Only pandas and spark dataframes are supported by the GroupBy operator."
         )
     named_grouped_df = lale.datasets.data_schemas.add_table_name(
         grouped_df, lale.datasets.data_schemas.get_table_name(X))
     return named_grouped_df
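A hedged usage sketch (operator location assumed):

import pandas as pd
from lale.expressions import it
from lale.lib.lale import GroupBy  # assumed operator location

df = pd.DataFrame({"state": ["NY", "CA", "NY"], "n": [1, 2, 3]})
grouped = GroupBy(by=[it.state]).transform(df)  # a pandas GroupBy object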
Example #19
def replace(df: Any, replace_expr: Expr, new_column_name: str):
    re: Any = replace_expr._expr
    column_name = re.args[0].attr
    if new_column_name is None:
        new_column_name = column_name
    mapping_dict = ast.literal_eval(re.args[1].value)
    if _is_pandas_df(df):
        new_column = df[column_name].replace(mapping_dict)
        df[new_column_name] = new_column
        if new_column_name != column_name:
            del df[column_name]
    elif spark_installed and _is_spark_df(df):
        mapping_expr = create_map(
            [lit(x) for x in chain(*mapping_dict.items())])  # type: ignore
        df = df.withColumn(new_column_name,
                           mapping_expr[df[column_name]])  # type: ignore
        if new_column_name != column_name:
            df = df.drop(column_name)
    else:
        raise ValueError(
            "function replace supports only Pandas dataframes or spark dataframes."
        )
    return new_column_name, df
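A pandas-path sketch, assuming a `replace` builder in `lale.expressions` that stores the mapping in a form `ast.literal_eval` can recover:

import pandas as pd
from lale.expressions import it
from lale.expressions import replace as replace_expr  # assumed builder

df = pd.DataFrame({"gender": ["m", "f", "m"]})
new_name, df = replace(df, replace_expr(it.gender, {"m": 0, "f": 1}), None)
# Passing new_column_name=None keeps the original name "gender", now 0/1.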
Example #20
        def filter(X):
            if isinstance(op, ast.Name):
                # currently only handles single argument predicates
                functions_module = importlib.import_module("lale.lib.lale.functions")
                func = getattr(functions_module, "filter_" + op.id)
                return func(X, lhs)

            import operator

            # Map the AST comparison node onto the corresponding Python operator;
            # pandas Series and Spark Columns both overload these operators.
            op_map = {
                ast.Eq: operator.eq,
                ast.NotEq: operator.ne,
                ast.GtE: operator.ge,
                ast.Gt: operator.gt,
                ast.LtE: operator.le,
                ast.Lt: operator.lt,
            }
            cmp_op = op_map.get(type(op))
            if cmp_op is None:
                raise ValueError(
                    "{} operator type found. Only ==, !=, >=, <=, >, < operators are supported".format(
                        op
                    )
                )
            assert lhs is not None
            assert rhs is not None
            # The right-hand side is either another column or a literal constant.
            rhs_is_col = _is_ast_subs_or_attr(expr_to_parse.comparators[0])

            # Filtering spark dataframes
            if _is_spark_df(X):
                return X.filter(cmp_op(col(lhs), col(rhs) if rhs_is_col else rhs))
            # Filtering pandas dataframes
            if _is_pandas_df(X):
                return X[cmp_op(X[lhs], X[rhs] if rhs_is_col else rhs)]
            raise ValueError(
                "Only pandas and spark dataframes are supported by the filter operator."
            )
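A hedged end-to-end sketch of the enclosing operator (location assumed):

import pandas as pd
from lale.expressions import it
from lale.lib.lale import Filter  # assumed operator location

df = pd.DataFrame({"age": [15, 22, 40]})
adults = Filter(pred=[it.age >= 18]).transform(df)  # rows with ages 22 and 40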
Example #21
def multitable_train_test_split(
    dataset,
    main_table_name,
    label_column_name,
    test_size=0.25,
    random_state=None,
) -> Tuple:
    """
    Splits a multi-table dataset into random train and test subsets. The split
    is computed on the rows of the main table; the remaining tables are passed
    through unchanged.

    Behaves similarly to the `train_test_split`_ function from scikit-learn.

    .. _`train_test_split`: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

    Parameters
    ----------
    dataset : list of either Pandas or Spark dataframes

      Each dataframe in the list corresponds to an entity/table in the multi-table setting.

    main_table_name : string

      The name of the main table as the split is going to be based on the main table.

    label_column_name : string

      The name of the label column from the main table.

    test_size : float or int, default=0.25

      If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
      If int, represents the absolute number of test samples.

    random_state : int, RandomState instance or None, default=None

      Controls the shuffling applied to the data before applying the split.
      Pass an integer for reproducible output across multiple function calls.

      - None

          RandomState used by numpy.random

      - numpy.random.RandomState

          Use the provided random state, only affecting other users of that same random state instance.

      - integer

          Explicit seed.

    Returns
    -------
    result : tuple

      - item 0: train_X, List of datasets corresponding to the train split

      - item 1: test_X, List of datasets corresponding to the test split

      - item 2: train_y

      - item 3: test_y

    """
    main_table_df = None
    index_of_main_table = -1
    for i, df in enumerate(dataset):
        if get_table_name(df) == main_table_name:
            main_table_df = df
            index_of_main_table = i
    if main_table_df is None:
        table_names = [get_table_name(df) for df in dataset]
        raise ValueError(
            f"Could not find {main_table_name} in the given dataset, the table names are {table_names}"
        )
    if _is_pandas_df(main_table_df):
        num_rows = len(main_table_df)
    elif _is_spark_df(main_table_df):
        num_rows = main_table_df.count()
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )
    if test_size > 0 and test_size < 1:
        num_test_rows = int(num_rows * test_size)
    else:
        num_test_rows = test_size
    test_indices = random.choice(range(num_rows), num_test_rows, replace=False)
    train_indices = list(set([*range(num_rows)]) - set(test_indices.tolist()))
    assert len(test_indices) + len(train_indices) == num_rows
    train_dataset = list(dataset)
    test_dataset = list(dataset)
    if _is_pandas_df(main_table_df):
        train_main_df = main_table_df.iloc[train_indices]
        test_main_df = main_table_df.iloc[test_indices]
        train_y = train_main_df[label_column_name]
        test_y = test_main_df[label_column_name]
    elif _is_spark_df(main_table_df):
        spark_session = SparkSession.builder.appName(
            "multitable_train_test_split").getOrCreate()
        train_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[train_indices])
        test_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[test_indices])
        train_y = train_main_df.select(label_column_name)
        test_y = test_main_df.select(label_column_name)
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )

    train_main_df = add_table_name(train_main_df, main_table_name)
    test_main_df = add_table_name(test_main_df, main_table_name)
    train_dataset[index_of_main_table] = train_main_df
    test_dataset[index_of_main_table] = test_main_df
    return train_dataset, test_dataset, train_y, test_y
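A small pandas sketch; only the main table is split, the other tables are passed through:

import pandas as pd
from lale.datasets.data_schemas import add_table_name

orders = add_table_name(pd.DataFrame({"oid": range(8), "label": [0, 1] * 4}), "orders")
items = add_table_name(pd.DataFrame({"oid": [0, 0, 1]}), "items")
train_X, test_X, train_y, test_y = multitable_train_test_split(
    [orders, items], "orders", "label", test_size=0.25)
# len(test_X[0]) == 2 of 8 rows; "items" appears unchanged in both lists.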
Example #22
def _df_count(X):
    if _is_pandas_df(X):
        return len(X)
    elif _is_spark_df(X):
        return X.count()
    # any other input type implicitly returns None