Example #1
        def join_df(left_df, right_df):
            # Nested helper: left_key_col, right_key_col, and self.join_type
            # come from the enclosing method's scope.

            # Joining spark dataframes
            if _is_spark_df(left_df) and _is_spark_df(right_df):
                on = []
                drop_col = []
                left_table = left_df.alias("left_table")
                right_table = right_df.alias("right_table")

                for k, key in enumerate(left_key_col):
                    on.append(
                        col("{}.{}".format("left_table", key)).eqNullSafe(
                            col("{}.{}".format("right_table",
                                               right_key_col[k]))))
                    if key == right_key_col[k]:
                        drop_col.append(key)
                op_df = left_table.join(right_table, on, self.join_type)
                for key in drop_col:
                    op_df = op_df.drop(getattr(right_table, key))
                return op_df

            # Joining pandas dataframes
            op_df = pd.merge(
                left_df,
                right_df,
                how=self.join_type,
                left_on=left_key_col,
                right_on=right_key_col,
            )
            return op_df
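
All of these examples rely on a small set of type-check helpers (_is_spark_df, _is_pandas_df, _is_pandas_series, and the spark_installed flag). As a rough sketch only, assuming pyspark is an optional dependency (the helper names come from the examples, but the bodies below are illustrative, not the library's actual implementation):

import pandas as pd

# Illustrative sketch, not the library's real code: the Spark check must
# degrade gracefully when pyspark is not installed.
try:
    from pyspark.sql import DataFrame as SparkDataFrame
    spark_installed = True
except ImportError:
    spark_installed = False


def _is_pandas_series(df):
    return isinstance(df, pd.Series)


def _is_pandas_df(df):
    return isinstance(df, pd.DataFrame)


def _is_spark_df(df):
    # Nothing can be a Spark dataframe if pyspark is not installed.
    return spark_installed and isinstance(df, SparkDataFrame)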
Example #2
def make_series_concat(df1, df2):
    if isinstance(df1, np.ndarray):
        assert isinstance(df2, np.ndarray)
        return np.concatenate((df1, df2))
    elif isinstance(df1, pd.Series):
        assert isinstance(df2, pd.Series)
        return pd.concat([df1, df2])
    elif _is_spark_df(df1):
        assert _is_spark_df(df2)
        return df1.union(df2)
    else:
        raise ValueError(f"Unsupported series type {type(df1)}")
Example #3
    def transform(self, X):
        table_name = lale.datasets.data_schemas.get_table_name(X)
        columns_to_keep = []

        def get_map_function_output(column, new_column_name):
            functions_module = importlib.import_module("lale.lib.lale.functions")
            if _is_ast_subscript(column._expr) or _is_ast_attribute(column._expr):
                function_name = "identity"
            else:
                function_name = column._expr.func.id
            map_func_to_be_called = getattr(functions_module, function_name)
            return map_func_to_be_called(X, column, new_column_name)

        if isinstance(self.columns, list):
            for column in self.columns:
                new_column_name, X = get_map_function_output(column, None)
                columns_to_keep.append(new_column_name)
        elif isinstance(self.columns, dict):
            for new_column_name, column in self.columns.items():
                new_column_name, X = get_map_function_output(column, new_column_name)
                columns_to_keep.append(new_column_name)
        else:
            raise ValueError("columns must be either a list or a dictionary.")
        mapped_df = X  # Do nothing as X already has the right columns
        if self.remainder == "drop":
            if _is_pandas_df(X):
                mapped_df = X[columns_to_keep]
            elif _is_spark_df(X):
                mapped_df = X.select(columns_to_keep)
            else:
                raise ValueError(
                    "Only Pandas or Spark dataframe are supported as inputs. Please check that pyspark is installed if you see this error for a Spark dataframe."
                )
        mapped_df = lale.datasets.data_schemas.add_table_name(mapped_df, table_name)
        return mapped_df
Example #4
def identity(df: Any, column: Expr, new_column_name: str):
    if _is_ast_subscript(column._expr):  # type: ignore
        column_name = column._expr.slice.value.s  # type: ignore
    elif _is_ast_attribute(column._expr):  # type: ignore
        column_name = column._expr.attr  # type: ignore
    else:
        raise ValueError(
            "Expression type not supported. Formats supported: it.column_name or it['column_name']."
        )

    if column_name is None or not column_name.strip():
        raise ValueError(
            "Name of the column to be renamed cannot be None or empty.")
    if new_column_name is None or not new_column_name.strip():
        raise ValueError(
            "New name of the column to be renamed cannot be None or empty.")

    if _is_pandas_df(df):
        df = df.rename(columns={column_name: new_column_name})
    elif spark_installed and _is_spark_df(df):
        df = df.withColumnRenamed(column_name, new_column_name)
    else:
        raise ValueError(
            "Function identity supports only Pandas dataframes or spark dataframes."
        )
    return new_column_name, df
Example #5
def time_functions(df: Any, dom_expr: Expr, new_column_name: str,
                   pandas_func: str, spark_func: str):
    fmt = None
    de: Any = dom_expr._expr
    column_name = de.args[0].attr
    if new_column_name is None:
        new_column_name = column_name
    if len(de.args) > 1:
        fmt = ast.literal_eval(de.args[1])
    if _is_pandas_df(df):
        new_column = pd.to_datetime(df[column_name], format=fmt)
        df[new_column_name] = getattr(getattr(new_column, "dt"), pandas_func)
        if new_column_name != column_name:
            del df[column_name]
    elif spark_installed and _is_spark_df(df):
        df = df.withColumn(column_name, to_timestamp(df[column_name],
                                                     fmt))  # type: ignore
        df = df.select(
            eval(spark_func + "(df[column_name])").alias(new_column_name))
        if new_column_name != column_name:
            df = df.drop(column_name)
    else:
        raise ValueError(
            "time_functions supports only Pandas dataframes or Spark dataframes."
        )

    return new_column_name, df
Example #6
def string_indexer(df: pd.DataFrame, dom_expr: Expr, new_column_name: str):
    de: Any = dom_expr._expr
    column_name = de.args[0].attr
    if new_column_name is None:
        new_column_name = column_name

    if _is_pandas_df(df):
        sorted_indices = df[column_name].value_counts().index
        df[new_column_name] = df[column_name].map(
            dict(zip(sorted_indices, range(0, len(sorted_indices)))))
        if new_column_name != column_name:
            del df[column_name]
    elif spark_installed and _is_spark_df(df):
        df = df.withColumnRenamed(
            column_name, "newColName"
        )  # renaming because inputCol and outputCol can't be the same.
        indexer = StringIndexer(inputCol="newColName",
                                outputCol=new_column_name)
        df = indexer.fit(df).transform(df)
        df = df.drop("newColName")
    else:
        raise ValueError(
            "string_indexer supports only Pandas dataframes or Spark dataframes."
        )

    return new_column_name, df
Example #7
def to_schema(obj) -> JSON_TYPE:
    result = None
    if obj is None:
        result = {"enum": [None]}
    elif isinstance(obj, np.ndarray):
        result = ndarray_to_schema(obj)
    elif isinstance(obj, scipy.sparse.csr_matrix):
        result = csr_matrix_to_schema(obj)
    elif isinstance(obj, pd.DataFrame):
        result = dataframe_to_schema(obj)
    elif isinstance(obj, pd.Series):
        result = series_to_schema(obj)
    elif torch_installed and isinstance(obj, torch.Tensor):
        result = torch_tensor_to_schema(obj)
    elif is_liac_arff(obj):
        result = liac_arff_to_schema(obj)
    elif lale.type_checking.is_schema(obj):
        result = obj
    elif isinstance(obj, list):
        result = list_tensor_to_schema(obj)
    elif _is_spark_df(obj):
        result = dataframe_to_schema(obj.toPandas())
    if result is None:
        raise ValueError(f"to_schema(obj), type {type(obj)}, value {obj}")
    lale.type_checking.validate_is_schema(result)
    return result
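
A minimal usage sketch with made-up data; the exact shape of the returned JSON schema is library-specific, so only its type is checked here:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
schema = to_schema(df)
# to_schema validates the result before returning it, so a plain dict
# (a JSON schema describing the dataframe) is expected.
print(isinstance(schema, dict))  # True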
Example #8
def filter_isnotnull(df: Any, column_name: str):
    if _is_pandas_df(df):
        return df[df[column_name].notnull()]
    elif spark_installed and _is_spark_df(df):
        return df.filter(~pyspark.sql.functions.isnull(df[column_name]))
    else:
        raise ValueError(
            "filter_isnotnull supports only Pandas dataframes or Spark dataframes."
        )
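
A small usage sketch with made-up data:

import pandas as pd

df = pd.DataFrame({"a": [1.0, None, 3.0], "b": ["x", "y", "z"]})
# Keeps only the rows where column "a" is not null.
print(filter_isnotnull(df, "a"))
#      a  b
# 0  1.0  x
# 2  3.0  z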
Example #9
def count(df):
    if isinstance(df, np.ndarray):
        return df.size
    if _is_pandas_df(df) or _is_pandas_series(df):
        return len(df)
    elif _is_spark_df(df):
        return df.count()
    else:
        return len(df)
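
Worth noting when reading count: for a NumPy array it returns the total number of elements (size), whereas for a pandas or Spark dataframe it returns the number of rows. A small sketch with made-up data:

import numpy as np
import pandas as pd

data = np.ones((3, 2))
print(count(data))                # 6  (3 rows x 2 columns)
print(count(pd.DataFrame(data)))  # 3  (number of rows)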
Example #10
 def transform(self, X):
     if _is_pandas_df(X):
         return self.transform_pandas_df(X)
     elif _is_spark_df(X):
         return self.transform_spark_df(X)
     else:
         raise ValueError(
             f"Only Pandas or Spark dataframe are supported as inputs, got {type(X)}. Please check that pyspark is installed if you see this error for a Spark dataframe."
         )
Example #11
def make_series_distinct(df):
    if isinstance(df, np.ndarray):
        return np.unique(df)
    elif isinstance(df, pd.Series):
        return df.unique()
    elif _is_spark_df(df):
        return df.distinct()
    else:
        raise ValueError(f"Unsupported series type {type(df)}")
Example #12
def select_col(df, col: column_index):
    if isinstance(df, np.ndarray):
        return df[col]
    elif _is_pandas_df(df):
        return df[col]
    elif _is_spark_df(df):
        return df.select(col)
    else:
        raise ValueError(f"Unsupported series type {type(df)}")
Example #13
def get_obj_cols(df):
    """
    Returns names of 'object' columns in the DataFrame.
    """
    obj_cols = []
    if _is_pandas_df(df):
        for idx, dt in enumerate(df.dtypes):
            if dt == "object" or is_category(dt):
                obj_cols.append(df.columns.values[idx])
    elif _is_spark_df(df):
        assert False, "Not yet implemented"

    return obj_cols
Example #14
def _is_string_df(X):
    if _is_pandas_df(X):
        return X.shape[1] == X.select_dtypes(include="object").shape[1]
    elif _is_spark_df(X):
        from pyspark.sql.types import StringType

        string_cols = [
            f.name for f in X.schema.fields
            if isinstance(f.dataType, StringType)
        ]
        return len(get_columns(X)) == len(string_cols)
    else:
        return False
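
A quick sanity check for _is_string_df with made-up pandas data:

import pandas as pd

# True only when every column has dtype "object" (string-like).
print(_is_string_df(pd.DataFrame({"a": ["x"], "b": ["y"]})))  # True
print(_is_string_df(pd.DataFrame({"a": ["x"], "b": [1]})))    # False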
Example #15
def get_columns(df) -> List[column_index]:
    if _is_pandas_series(df):
        return pd.Series([df.name])
    if _is_pandas_df(df):
        return df.columns
    if _is_spark_with_index(df):
        return pd.Series(df.columns_without_index)
    if _is_spark_df(df):
        return df.columns
    if isinstance(df, np.ndarray):
        # should have more asserts here
        _, num_cols = df.shape
        return list(range(num_cols))
    assert False, type(df)
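
A brief usage sketch for get_columns with made-up inputs:

import numpy as np
import pandas as pd

print(list(get_columns(pd.DataFrame({"a": [1], "b": [2]}))))  # ['a', 'b']
print(get_columns(np.zeros((4, 3))))                          # [0, 1, 2]
print(list(get_columns(pd.Series([1, 2], name="a"))))         # ['a']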
Example #16
 def split_df(self, X):
     if self.label_name not in X.columns:
         return X, None
     if _is_pandas_df(X):
         y = pd.DataFrame(X[self.label_name])
         X = X.drop(self.label_name, axis=1)
     elif _is_spark_df(X):
         y = X.select(X[self.label_name])
         X = X.drop(self.label_name)
     else:
         raise ValueError(
             "Only Pandas or Spark dataframe are supported as inputs. Please check that pyspark is installed if you see this error for a Spark dataframe."
         )
     return X, y
Example #17
 def _lift(X, hyperparams):
     feature_names_in_ = get_columns(X)
     strategy = hyperparams["strategy"]
     if strategy == "constant":
         fill_value = _SimpleImputerImpl._get_fill_value(X, hyperparams)
         agg_data = [[fill_value for col in get_columns(X)]]
         lifted_statistics = pd.DataFrame(agg_data, columns=get_columns(X))
     elif strategy == "mean":
         agg_op_sum = Aggregate(
             columns={c: sum(it[c])
                      for c in get_columns(X)},
             exclude_value=hyperparams["missing_values"],
         )
         agg_op_count = Aggregate(
             columns={c: count(it[c])
                      for c in get_columns(X)},
             exclude_value=hyperparams["missing_values"],
         )
         lifted_statistics = {}
         agg_sum = agg_op_sum.transform(X)
         if agg_sum is not None and _is_spark_df(agg_sum):
             agg_sum = agg_sum.toPandas()
         agg_count = agg_op_count.transform(X)
         if agg_count is not None and _is_spark_df(agg_count):
             agg_count = agg_count.toPandas()
         lifted_statistics["sum"] = agg_sum
         lifted_statistics["count"] = agg_count
     else:
         raise ValueError(
             "_lift is only supported for imputation strategy `mean` and `constant`."
         )
     return (
         feature_names_in_,
         lifted_statistics,
         strategy,
     )  # strategy is added so that _combine can use it
Example #18
def _is_numeric_df(X):
    if _is_pandas_df(X):
        return X.shape[1] == X.select_dtypes(include=np.number).shape[1]
    elif _is_spark_df(X):
        from pyspark.sql.types import NumericType

        numeric_cols = [
            f.name for f in X.schema.fields
            if isinstance(f.dataType, NumericType)
        ]
        if _is_spark_with_index(X) and get_index_name(X) in numeric_cols:
            numeric_cols.remove(get_index_name(X))
        return len(get_columns(X)) == len(numeric_cols)
    else:
        return False
Example #19
 def _set_fit_attributes(self, lifted):
     # set attribute values
     self.feature_names_in_ = lifted[0]
     self.n_features_in_ = len(self.feature_names_in_)
     self._lifted_statistics = lifted[1]
     strategy = self._hyperparams["strategy"]
     if strategy == "constant":
         self.statistics_ = self._lifted_statistics.to_numpy()[0]
     elif strategy == "mean":
         self.statistics_ = (self._lifted_statistics["sum"] /
                             self._lifted_statistics["count"]).to_numpy()[0]
     else:
         agg_data = self._lifted_statistics
         if agg_data is not None and _is_spark_df(agg_data):
             agg_data = agg_data.toPandas()
         if agg_data is not None and _is_pandas_df(agg_data):
             # Converting from a 2-d array to 1-d
             self.statistics_ = agg_data.to_numpy()[0]
     self._transformer = None
Example #20
 def _lift(X, hyperparams):
     agg = {f"{c}_min": agg_min(it[c]) for c in X.columns}
     agg.update({f"{c}_max": agg_max(it[c]) for c in X.columns})
     aggregate = Aggregate(columns=agg)
     data_min_max = aggregate.transform(X)
     if _is_spark_df(X):
         data_min_max = data_min_max.toPandas()
     n = len(X.columns)
     data_min_ = np.zeros(shape=(n))
     data_max_ = np.zeros(shape=(n))
     for i, c in enumerate(X.columns):
         data_min_[i] = data_min_max[f"{c}_min"]
         data_max_[i] = data_min_max[f"{c}_max"]
     data_min_ = np.array(data_min_)
     data_max_ = np.array(data_max_)
     n_samples_seen_ = _df_count(X)
     n_features_in_ = len(X.columns)
     feature_names_in_ = X.columns
     return data_min_, data_max_, n_samples_seen_, n_features_in_, feature_names_in_
Example #21
 def transform(self, X):
     group_by_keys = []
     for by_element in self.by if self.by is not None else []:
         expr_to_parse = by_element._expr
         group_by_keys.append(self._get_group_key(expr_to_parse))
     col_not_in_X = np.setdiff1d(group_by_keys, X.columns)
     if col_not_in_X.size > 0:
         raise ValueError(
             "GroupBy key columns {} not present in input dataframe X.".
             format(col_not_in_X))
     if _is_spark_df(X):
         grouped_df = X.groupby(group_by_keys)
     elif _is_pandas_df(X):
         grouped_df = X.groupby(group_by_keys, sort=False)
     else:
         raise ValueError(
             "Only pandas and spark dataframes are supported by the GroupBy operator."
         )
     named_grouped_df = lale.datasets.data_schemas.add_table_name(
         grouped_df, lale.datasets.data_schemas.get_table_name(X))
     return named_grouped_df
Example #22
    def transform(self, X):
        by = self.by
        orders: List[Tuple[str, bool]]
        if isinstance(by, list):
            orders = [self._get_order_key(k) for k in by]
        else:
            orders = [self._get_order_key(by)]

        cols: List[str] = [col for col, _ in orders]
        ascs: List[bool] = [asc for _, asc in orders]
        if _is_pandas_df(X):
            ordered_df = X.sort_values(by=cols, ascending=ascs)
        elif _is_spark_df(X):
            ordered_df = X.orderBy(cols, ascending=ascs)
        else:
            raise ValueError(
                "Only Pandas or Spark dataframe are supported as inputs. Please check that pyspark is installed if you see this error for a Spark dataframe."
            )

        ordered_df = forward_metadata(X, ordered_df)
        return ordered_df
Example #23
def replace(df: Any, replace_expr: Expr, new_column_name: str):
    re: Any = replace_expr._expr
    column_name = re.args[0].attr
    if new_column_name is None:
        new_column_name = column_name
    mapping_dict = ast.literal_eval(re.args[1].value)
    if _is_pandas_df(df):
        new_column = df[column_name].replace(mapping_dict)
        df[new_column_name] = new_column
        if new_column_name != column_name:
            del df[column_name]
    elif spark_installed and _is_spark_df(df):
        mapping_expr = create_map(
            [lit(x) for x in chain(*mapping_dict.items())])  # type: ignore
        df = df.withColumn(new_column_name,
                           mapping_expr[df[column_name]])  # type: ignore
        if new_column_name != column_name:
            df = df.drop(column_name)
    else:
        raise ValueError(
            "function replace supports only Pandas dataframes or spark dataframes."
        )
    return new_column_name, df
Example #24
 def to_monoid(self, v) -> _MinMaxScalerMonoid:
     X, _ = v
     agg = {f"{c}_min": agg_min(it[c]) for c in get_columns(X)}
     agg.update({f"{c}_max": agg_max(it[c]) for c in get_columns(X)})
     aggregate = Aggregate(columns=agg)
     data_min_max = aggregate.transform(X)
     if _is_spark_df(X):
         data_min_max = data_min_max.toPandas()
     n = len(get_columns(X))
     data_min_ = np.zeros(shape=(n))
     data_max_ = np.zeros(shape=(n))
     for i, c in enumerate(get_columns(X)):
         data_min_[i] = data_min_max[f"{c}_min"]
         data_max_[i] = data_min_max[f"{c}_max"]
     data_min_ = np.array(data_min_)
     data_max_ = np.array(data_max_)
     n_samples_seen_ = count(X)
     feature_names_in_ = get_columns(X)
     return _MinMaxScalerMonoid(
         data_min_=data_min_,
         data_max_=data_max_,
         n_samples_seen_=n_samples_seen_,
         feature_names_in_=feature_names_in_,
     )
Example #25
 def __call__(self, X, y=None):
     if not _is_spark_df(X):
         logger.warning(
             f"SparkExplain called with non spark data of type {type(X)}")
     else:
         X.explain(extended=self._extended, mode=self._mode)
Example #26
def multitable_train_test_split(
    dataset,
    main_table_name,
    label_column_name,
    test_size=0.25,
    random_state=None,
) -> Tuple:
    """
    Splits X and y into random train and test subsets stratified by
    labels and protected attributes.

    Behaves similar to the `train_test_split`_ function from scikit-learn.

    .. _`train_test_split`: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

    Parameters
    ----------
    dataset : list of either Pandas or Spark dataframes

      Each dataframe in the list corresponds to an entity/table in the multi-table setting.

    main_table_name : string

      The name of the main table as the split is going to be based on the main table.

    label_column_name : string

      The name of the label column from the main table.

    test_size : float or int, default=0.25

      If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
      If int, represents the absolute number of test samples.

    random_state : int, RandomState instance or None, default=None

      Controls the shuffling applied to the data before applying the split.
      Pass an integer for reproducible output across multiple function calls.

      - None

          RandomState used by numpy.random

      - numpy.random.RandomState

          Use the provided random state, only affecting other users of that same random state instance.

      - integer

          Explicit seed.

    Returns
    -------
    result : tuple

      - item 0: train_X, List of datasets corresponding to the train split

      - item 1: test_X, List of datasets corresponding to the test split

      - item 2: train_y

      - item 3: test_y

    """
    main_table_df = None
    index_of_main_table = -1
    for i, df in enumerate(dataset):
        if get_table_name(df) == main_table_name:
            main_table_df = df
            index_of_main_table = i
    if main_table_df is None:
        table_names = [get_table_name(df) for df in dataset]
        raise ValueError(
            f"Could not find {main_table_name} in the given dataset, the table names are {table_names}"
        )
    if _is_pandas_df(main_table_df):
        num_rows = len(main_table_df)
    elif _is_spark_df(main_table_df):
        # main_table_df = main_table_df.toPandas()
        num_rows = main_table_df.count()
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )
    if test_size > 0 and test_size < 1:
        num_test_rows = int(num_rows * test_size)
    else:
        num_test_rows = test_size
    # Here `random` is numpy.random; choice(..., replace=False) samples without replacement.
    test_indices = random.choice(range(num_rows), num_test_rows, replace=False)
    train_indices = list(set([*range(num_rows)]) - set(test_indices.tolist()))
    assert len(test_indices) + len(train_indices) == num_rows
    train_dataset = [table for table in dataset]
    test_dataset = [table for table in dataset]
    if _is_pandas_df(main_table_df):
        train_main_df = main_table_df.iloc[train_indices]
        test_main_df = main_table_df.iloc[test_indices]
        train_y = train_main_df[label_column_name]
        test_y = test_main_df[label_column_name]
    elif _is_spark_df(main_table_df):
        spark_session = SparkSession.builder.appName(
            "multitable_train_test_split").getOrCreate()
        train_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[train_indices])
        test_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[test_indices])
        train_y = train_main_df.select(label_column_name)
        test_y = test_main_df.select(label_column_name)
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )

    train_main_df = add_table_name(train_main_df, main_table_name)
    test_main_df = add_table_name(test_main_df, main_table_name)
    train_dataset[index_of_main_table] = train_main_df
    test_dataset[index_of_main_table] = test_main_df
    return train_dataset, test_dataset, train_y, test_y
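
A minimal usage sketch for the pandas case, with two made-up tables; add_table_name is the same lale.datasets.data_schemas helper used in Example #3:

import pandas as pd
from lale.datasets.data_schemas import add_table_name

main = add_table_name(
    pd.DataFrame({"id": range(8), "target": [0, 1] * 4}), "main")
orders = add_table_name(
    pd.DataFrame({"id": range(8), "amount": range(8)}), "orders")

train_X, test_X, train_y, test_y = multitable_train_test_split(
    [main, orders],
    main_table_name="main",
    label_column_name="target",
    test_size=0.25,
)
# One entry per input table in each split; only the main table is actually split.
print(len(train_X), len(test_X))  # 2 2
print(len(train_y), len(test_y))  # 6 2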
Example #27
def _df_count(X):
    if _is_pandas_df(X):
        return len(X)
    elif _is_spark_df(X):
        return X.count()
Example #28
        def filter(X):
            # Nested helper: op, lhs, rhs, and expr_to_parse come from the
            # enclosing method's scope.
            if isinstance(op, ast.Name):
                # currently only handles single argument predicates
                functions_module = importlib.import_module("lale.lib.lale.functions")
                func = getattr(functions_module, "filter_" + op.id)
                return func(X, lhs)

            # Filtering spark dataframes
            if _is_spark_df(X):
                if isinstance(op, ast.Eq):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X.filter(col(lhs) == col(rhs))
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X.filter(col(lhs) == rhs)
                    )
                elif isinstance(op, ast.NotEq):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X.filter(col(lhs) != col(rhs))
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X.filter(col(lhs) != rhs)
                    )
                elif isinstance(op, ast.GtE):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X.filter(col(lhs) >= col(rhs))
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X.filter(col(lhs) >= rhs)
                    )
                elif isinstance(op, ast.Gt):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X.filter(col(lhs) > col(rhs))
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X.filter(col(lhs) > rhs)
                    )
                elif isinstance(op, ast.LtE):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X.filter(col(lhs) <= col(rhs))
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X.filter(col(lhs) <= rhs)
                    )
                elif isinstance(op, ast.Lt):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X.filter(col(lhs) < col(rhs))
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X.filter(col(lhs) < rhs)
                    )
                else:
                    raise ValueError(
                        "{} operator type found. Only ==, !=, >=, <=, >, < operators are supported".format(
                            op
                        )
                    )
            # Filtering pandas dataframes
            if _is_pandas_df(X):
                assert lhs is not None
                assert rhs is not None
                if isinstance(op, ast.Eq):
                    return (
                        X[X[lhs] == X[rhs]]
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X[X[lhs] == rhs]
                    )
                elif isinstance(op, ast.NotEq):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X[X[lhs] != X[rhs]]
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X[X[lhs] != rhs]
                    )
                elif isinstance(op, ast.GtE):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X[X[lhs] >= X[rhs]]
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X[X[lhs] >= rhs]
                    )
                elif isinstance(op, ast.Gt):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X[X[lhs] > X[rhs]]
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X[X[lhs] > rhs]
                    )
                elif isinstance(op, ast.LtE):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X[X[lhs] <= X[rhs]]
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X[X[lhs] <= rhs]
                    )
                elif isinstance(op, ast.Lt):
                    assert lhs is not None
                    assert rhs is not None
                    return (
                        X[X[lhs] < X[rhs]]
                        if _is_ast_subs_or_attr(expr_to_parse.comparators[0])
                        else X[X[lhs] < rhs]
                    )
                else:
                    raise ValueError(
                        "{} operator type found. Only ==, !=, >=, <=, >, < operators are supported".format(
                            op
                        )
                    )
            else:
                raise ValueError(
                    "Only pandas and spark dataframes are supported by the filter operator."
                )
Example #29
    def transform(self, X):
        if all([_is_pandas(d) for d in X]):
            name2series = {}
            for dataset in X:
                if _is_pandas_df(dataset):
                    for name in dataset.columns:
                        name2series[name] = name2series.get(
                            name, []) + [dataset[name]]
                elif _is_pandas_series(dataset):
                    name = dataset.name
                    name2series[name] = name2series.get(name, []) + [dataset]
                else:
                    assert False
            duplicates = [
                name for name, ls in name2series.items() if len(ls) > 1
            ]
            if len(duplicates) == 0:
                result = pd.concat(X, axis=1)
            else:
                logger.info(
                    f"ConcatFeatures duplicate column names {duplicates}")
                deduplicated = [ls[-1] for _, ls in name2series.items()]
                result = pd.concat(deduplicated, axis=1)
        elif all([_is_spark_df(d) for d in X]):

            def join(d1, d2):
                n1 = get_table_name(d1)
                n2 = get_table_name(d2)
                if n1 is None or n2 is None:
                    raise ValueError(
                        "Table names are required to concatenate features of Spark dataframes"
                    )
                index_col1 = get_index_name(d1)
                index_col2 = get_index_name(d2)
                if index_col1 is None or index_col2 is None:
                    raise ValueError(
                        "Index columns are required to concatenate features of Spark dataframes"
                    )
                transformer = Join(
                    pred=[it[n1][index_col1] == it[n2][index_col2]])
                return transformer.transform([d1, d2])

            result = reduce(join, X)
        elif all([_is_pandas(d) or _is_spark_df(d) for d in X]):
            X = [d.toPandas() if _is_spark_df(d) else d for d in X]
            return self.transform(X)
        else:
            np_datasets = []
            # Preprocess the datasets to convert them to 2-d numpy arrays
            for dataset in X:
                if _is_pandas(dataset):
                    np_dataset = dataset.values
                elif _is_spark_df(dataset):
                    np_dataset = dataset.toPandas().values
                elif isinstance(dataset, scipy.sparse.csr_matrix):
                    np_dataset = dataset.toarray()
                elif torch_installed and isinstance(dataset, torch.Tensor):
                    np_dataset = dataset.detach().cpu().numpy()
                else:
                    np_dataset = dataset
                if hasattr(np_dataset, "shape"):
                    if len(np_dataset.shape) == 1:  # To handle numpy column vectors
                        np_dataset = np.reshape(np_dataset, (np_dataset.shape[0], 1))
                np_datasets.append(np_dataset)
            result = np.concatenate(np_datasets, axis=1)
        return result