def join_df(left_df, right_df):
    # Joining Spark dataframes
    if _is_spark_df(left_df) and _is_spark_df(right_df):
        on = []
        drop_col = []
        left_table = left_df.alias("left_table")
        right_table = right_df.alias("right_table")
        for k, key in enumerate(left_key_col):
            # eqNullSafe so that NULL keys compare equal on both sides
            on.append(
                col("{}.{}".format("left_table", key)).eqNullSafe(
                    col("{}.{}".format("right_table", right_key_col[k]))
                )
            )
            if key == right_key_col[k]:
                drop_col.append(key)
        op_df = left_table.join(right_table, on, self.join_type)
        for key in drop_col:
            op_df = op_df.drop(getattr(right_table, key))
        return op_df
    # Joining pandas dataframes
    op_df = pd.merge(
        left_df,
        right_df,
        how=self.join_type,
        left_on=left_key_col,
        right_on=right_key_col,
    )
    return op_df
def make_series_concat(df1, df2):
    if isinstance(df1, np.ndarray):
        assert isinstance(df2, np.ndarray)
        return np.concatenate((df1, df2))
    elif isinstance(df1, pd.Series):
        assert isinstance(df2, pd.Series)
        return pd.concat([df1, df2])
    elif _is_spark_df(df1):
        assert _is_spark_df(df2)
        return df1.union(df2)
    else:
        raise ValueError(f"Unsupported series type {type(df1)}")
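# Usage sketch for make_series_concat (illustrative, not part of the library);
# the Spark branch behaves analogously via DataFrame.union.
import numpy as np
import pandas as pd

assert list(make_series_concat(np.array([1, 2]), np.array([3, 4]))) == [1, 2, 3, 4]
assert list(make_series_concat(pd.Series([1, 2]), pd.Series([3, 4]))) == [1, 2, 3, 4]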
def transform(self, X):
    table_name = lale.datasets.data_schemas.get_table_name(X)
    columns_to_keep = []

    def get_map_function_output(column, new_column_name):
        functions_module = importlib.import_module("lale.lib.lale.functions")
        if _is_ast_subscript(column._expr) or _is_ast_attribute(column._expr):
            function_name = "identity"
        else:
            function_name = column._expr.func.id
        map_func_to_be_called = getattr(functions_module, function_name)
        return map_func_to_be_called(X, column, new_column_name)

    if isinstance(self.columns, list):
        for column in self.columns:
            new_column_name, X = get_map_function_output(column, None)
            columns_to_keep.append(new_column_name)
    elif isinstance(self.columns, dict):
        for new_column_name, column in self.columns.items():
            new_column_name, X = get_map_function_output(column, new_column_name)
            columns_to_keep.append(new_column_name)
    else:
        raise ValueError("columns must be either a list or a dictionary.")
    mapped_df = X  # Do nothing, as X already has the right columns
    if self.remainder == "drop":
        if _is_pandas_df(X):
            mapped_df = X[columns_to_keep]
        elif _is_spark_df(X):
            mapped_df = X.select(columns_to_keep)
        else:
            raise ValueError(
                "Only Pandas or Spark dataframes are supported as inputs. Please check that pyspark is installed if you see this error for a Spark dataframe."
            )
    mapped_df = lale.datasets.data_schemas.add_table_name(mapped_df, table_name)
    return mapped_df
def identity(df: Any, column: Expr, new_column_name: str):
    if _is_ast_subscript(column._expr):  # type: ignore
        column_name = column._expr.slice.value.s  # type: ignore
    elif _is_ast_attribute(column._expr):  # type: ignore
        column_name = column._expr.attr  # type: ignore
    else:
        raise ValueError(
            "Expression type not supported. Formats supported: it.column_name or it['column_name']."
        )
    if column_name is None or not column_name.strip():
        raise ValueError("Name of the column to be renamed cannot be None or empty.")
    if new_column_name is None or not new_column_name.strip():
        raise ValueError("New name of the column to be renamed cannot be None or empty.")
    if _is_pandas_df(df):
        df = df.rename(columns={column_name: new_column_name})
    elif spark_installed and _is_spark_df(df):
        df = df.withColumnRenamed(column_name, new_column_name)
    else:
        raise ValueError(
            "Function identity supports only Pandas dataframes or Spark dataframes."
        )
    return new_column_name, df
def time_functions(
    df: Any, dom_expr: Expr, new_column_name: str, pandas_func: str, spark_func: str
):
    fmt = None
    de: Any = dom_expr._expr
    column_name = de.args[0].attr
    if new_column_name is None:
        new_column_name = column_name
    if len(de.args) > 1:
        fmt = ast.literal_eval(de.args[1])
    if _is_pandas_df(df):
        new_column = pd.to_datetime(df[column_name], format=fmt)
        df[new_column_name] = getattr(new_column.dt, pandas_func)
        if new_column_name != column_name:
            del df[column_name]
    elif spark_installed and _is_spark_df(df):
        df = df.withColumn(column_name, to_timestamp(df[column_name], fmt))  # type: ignore
        # spark_func names a function in scope, e.g. one imported from
        # pyspark.sql.functions such as dayofmonth
        df = df.select(eval(spark_func + "(df[column_name])").alias(new_column_name))
        if new_column_name != column_name:
            df = df.drop(column_name)
    else:
        raise ValueError(
            f"Function {pandas_func} supports only Pandas dataframes or Spark dataframes."
        )
    return new_column_name, df
def string_indexer(df: pd.DataFrame, dom_expr: Expr, new_column_name: str):
    de: Any = dom_expr._expr
    column_name = de.args[0].attr
    if new_column_name is None:
        new_column_name = column_name
    if _is_pandas_df(df):
        # Number categories by descending frequency, mirroring Spark's StringIndexer.
        sorted_indices = df[column_name].value_counts().index
        df[new_column_name] = df[column_name].map(
            dict(zip(sorted_indices, range(len(sorted_indices))))
        )
        if new_column_name != column_name:
            del df[column_name]
    elif spark_installed and _is_spark_df(df):
        # Renaming because inputCol and outputCol can't be the same.
        df = df.withColumnRenamed(column_name, "newColName")
        indexer = StringIndexer(inputCol="newColName", outputCol=new_column_name)
        df = indexer.fit(df).transform(df)
        df = df.drop("newColName")
    else:
        raise ValueError(
            "Function string_indexer supports only Pandas dataframes or Spark dataframes."
        )
    return new_column_name, df
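# Standalone illustration of the frequency-indexing idiom used by the pandas
# branch of string_indexer above (without the Expr plumbing); categories are
# numbered by descending frequency.
import pandas as pd

s = pd.Series(["b", "a", "b", "c", "b", "a"])
sorted_indices = s.value_counts().index  # Index(['b', 'a', 'c'])
codes = s.map(dict(zip(sorted_indices, range(len(sorted_indices)))))
assert list(codes) == [0, 1, 0, 2, 0, 1]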
def to_schema(obj) -> JSON_TYPE:
    result = None
    if obj is None:
        result = {"enum": [None]}
    elif isinstance(obj, np.ndarray):
        result = ndarray_to_schema(obj)
    elif isinstance(obj, scipy.sparse.csr_matrix):
        result = csr_matrix_to_schema(obj)
    elif isinstance(obj, pd.DataFrame):
        result = dataframe_to_schema(obj)
    elif isinstance(obj, pd.Series):
        result = series_to_schema(obj)
    elif torch_installed and isinstance(obj, torch.Tensor):
        result = torch_tensor_to_schema(obj)
    elif is_liac_arff(obj):
        result = liac_arff_to_schema(obj)
    elif lale.type_checking.is_schema(obj):
        result = obj
    elif isinstance(obj, list):
        result = list_tensor_to_schema(obj)
    elif _is_spark_df(obj):
        result = dataframe_to_schema(obj.toPandas())
    if result is None:
        raise ValueError(f"to_schema(obj), type {type(obj)}, value {obj}")
    lale.type_checking.validate_is_schema(result)
    return result
def filter_isnotnull(df: Any, column_name: str):
    if _is_pandas_df(df):
        return df[df[column_name].notnull()]
    elif spark_installed and _is_spark_df(df):
        return df.filter(~pyspark.sql.functions.isnull(df[column_name]))
    else:
        raise ValueError(
            "The filter isnotnull supports only Pandas dataframes or Spark dataframes."
        )
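# Usage sketch for filter_isnotnull on the pandas branch (illustrative only;
# the Spark branch keeps the same rows via pyspark.sql.functions.isnull).
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0]})
assert list(filter_isnotnull(df, "a")["a"]) == [1.0, 3.0]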
def count(df):
    if isinstance(df, np.ndarray):
        # For a 2-d array this is the total number of elements, not rows.
        return df.size
    if _is_pandas_df(df) or _is_pandas_series(df):
        return len(df)
    elif _is_spark_df(df):
        return df.count()
    else:
        return len(df)
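# Usage sketch for count (illustrative): note the numpy case returns the
# total number of elements, while the pandas case returns the number of rows.
import numpy as np
import pandas as pd

assert count(np.zeros((3, 2))) == 6
assert count(pd.DataFrame({"a": [1, 2, 3]})) == 3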
def transform(self, X):
    if _is_pandas_df(X):
        return self.transform_pandas_df(X)
    elif _is_spark_df(X):
        return self.transform_spark_df(X)
    else:
        raise ValueError(
            f"Only Pandas or Spark dataframes are supported as inputs, got {type(X)}. Please check that pyspark is installed if you see this error for a Spark dataframe."
        )
def make_series_distinct(df):
    if isinstance(df, np.ndarray):
        return np.unique(df)
    elif isinstance(df, pd.Series):
        return df.unique()
    elif _is_spark_df(df):
        return df.distinct()
    else:
        raise ValueError(f"Unsupported series type {type(df)}")
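# Usage sketch for make_series_distinct (illustrative): each branch returns
# the backend's native notion of distinct values.
import numpy as np
import pandas as pd

assert list(make_series_distinct(np.array([1, 1, 2]))) == [1, 2]  # np.unique sorts
assert list(make_series_distinct(pd.Series([2, 1, 2]))) == [2, 1]  # order of appearance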
def select_col(df, col: column_index):
    if isinstance(df, np.ndarray):
        return df[col]
    elif _is_pandas_df(df):
        return df[col]
    elif _is_spark_df(df):
        return df.select(col)
    else:
        raise ValueError(f"Unsupported dataframe type {type(df)}")
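# Usage sketch for select_col (illustrative): with a single column name,
# pandas indexing yields a Series, whereas Spark's .select returns a
# one-column DataFrame.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
assert list(select_col(df, "a")) == [1, 2]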
def get_obj_cols(df):
    """
    Returns names of 'object' columns in the DataFrame.
    """
    obj_cols = []
    if _is_pandas_df(df):
        for idx, dt in enumerate(df.dtypes):
            if dt == "object" or is_category(dt):
                obj_cols.append(df.columns.values[idx])
    elif _is_spark_df(df):
        assert False, "Not yet implemented"
    return obj_cols
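# Usage sketch for get_obj_cols (illustrative): string columns in pandas
# default to dtype 'object', so only "b" is reported here.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
assert get_obj_cols(df) == ["b"]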
def _is_string_df(X):
    if _is_pandas_df(X):
        return X.shape[1] == X.select_dtypes(include="object").shape[1]
    elif _is_spark_df(X):
        from pyspark.sql.types import StringType

        string_cols = [
            f.name for f in X.schema.fields if isinstance(f.dataType, StringType)
        ]
        return len(get_columns(X)) == len(string_cols)
    else:
        return False
def get_columns(df) -> List[column_index]:
    if _is_pandas_series(df):
        return pd.Series([df.name])
    if _is_pandas_df(df):
        return df.columns
    if _is_spark_with_index(df):
        return pd.Series(df.columns_without_index)
    if _is_spark_df(df):
        return df.columns
    if isinstance(df, np.ndarray):
        # should have more asserts here
        _, num_cols = df.shape
        return list(range(num_cols))
    assert False, type(df)
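# Usage sketch for get_columns across input kinds (illustrative; the Spark
# cases behave like the pandas ones, modulo index-column handling).
import numpy as np
import pandas as pd

assert list(get_columns(pd.DataFrame({"a": [1], "b": [2]}))) == ["a", "b"]
assert list(get_columns(pd.Series([1], name="a"))) == ["a"]
assert get_columns(np.zeros((3, 2))) == [0, 1]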
def split_df(self, X):
    if self.label_name not in X.columns:
        return X, None
    if _is_pandas_df(X):
        y = pd.DataFrame(X[self.label_name])
        X = X.drop(self.label_name, axis=1)
    elif _is_spark_df(X):
        y = X.select(X[self.label_name])
        X = X.drop(self.label_name)
    else:
        raise ValueError(
            "Only Pandas or Spark dataframes are supported as inputs. Please check that pyspark is installed if you see this error for a Spark dataframe."
        )
    return X, y
def _lift(X, hyperparams):
    feature_names_in_ = get_columns(X)
    strategy = hyperparams["strategy"]
    if strategy == "constant":
        fill_value = _SimpleImputerImpl._get_fill_value(X, hyperparams)
        agg_data = [[fill_value for _ in get_columns(X)]]
        lifted_statistics = pd.DataFrame(agg_data, columns=get_columns(X))
    elif strategy == "mean":
        agg_op_sum = Aggregate(
            columns={c: sum(it[c]) for c in get_columns(X)},
            exclude_value=hyperparams["missing_values"],
        )
        agg_op_count = Aggregate(
            columns={c: count(it[c]) for c in get_columns(X)},
            exclude_value=hyperparams["missing_values"],
        )
        lifted_statistics = {}
        agg_sum = agg_op_sum.transform(X)
        if agg_sum is not None and _is_spark_df(agg_sum):
            agg_sum = agg_sum.toPandas()
        agg_count = agg_op_count.transform(X)
        if agg_count is not None and _is_spark_df(agg_count):
            agg_count = agg_count.toPandas()
        lifted_statistics["sum"] = agg_sum
        lifted_statistics["count"] = agg_count
    else:
        raise ValueError(
            "_lift is only supported for imputation strategies `mean` and `constant`."
        )
    # strategy is added so that _combine can use it
    return (feature_names_in_, lifted_statistics, strategy)
def _is_numeric_df(X):
    if _is_pandas_df(X):
        return X.shape[1] == X.select_dtypes(include=np.number).shape[1]
    elif _is_spark_df(X):
        from pyspark.sql.types import NumericType

        numeric_cols = [
            f.name for f in X.schema.fields if isinstance(f.dataType, NumericType)
        ]
        if _is_spark_with_index(X) and get_index_name(X) in numeric_cols:
            numeric_cols.remove(get_index_name(X))
        return len(get_columns(X)) == len(numeric_cols)
    else:
        return False
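# Usage sketch for _is_numeric_df (illustrative): a frame counts as numeric
# only if every column has a numeric dtype.
import pandas as pd

assert _is_numeric_df(pd.DataFrame({"a": [1], "b": [2.5]}))
assert not _is_numeric_df(pd.DataFrame({"a": [1], "b": ["x"]}))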
def _set_fit_attributes(self, lifted):
    # set attribute values
    self.feature_names_in_ = lifted[0]
    self.n_features_in_ = len(self.feature_names_in_)
    self._lifted_statistics = lifted[1]
    strategy = self._hyperparams["strategy"]
    if strategy == "constant":
        self.statistics_ = self._lifted_statistics.to_numpy()[0]
    elif strategy == "mean":
        self.statistics_ = (
            self._lifted_statistics["sum"] / self._lifted_statistics["count"]
        ).to_numpy()[0]
    else:
        agg_data = self._lifted_statistics
        if agg_data is not None and _is_spark_df(agg_data):
            agg_data = agg_data.toPandas()
        if agg_data is not None and _is_pandas_df(agg_data):
            # Converting from a 2-d array to 1-d
            self.statistics_ = agg_data.to_numpy()[0]
    self._transformer = None
def _lift(X, hyperparams):
    agg = {f"{c}_min": agg_min(it[c]) for c in X.columns}
    agg.update({f"{c}_max": agg_max(it[c]) for c in X.columns})
    aggregate = Aggregate(columns=agg)
    data_min_max = aggregate.transform(X)
    if _is_spark_df(X):
        data_min_max = data_min_max.toPandas()
    n = len(X.columns)
    data_min_ = np.zeros(shape=(n,))
    data_max_ = np.zeros(shape=(n,))
    for i, c in enumerate(X.columns):
        data_min_[i] = data_min_max[f"{c}_min"]
        data_max_[i] = data_min_max[f"{c}_max"]
    n_samples_seen_ = _df_count(X)
    n_features_in_ = len(X.columns)
    feature_names_in_ = X.columns
    return data_min_, data_max_, n_samples_seen_, n_features_in_, feature_names_in_
def transform(self, X):
    group_by_keys = []
    for by_element in self.by if self.by is not None else []:
        expr_to_parse = by_element._expr
        group_by_keys.append(self._get_group_key(expr_to_parse))
    col_not_in_X = np.setdiff1d(group_by_keys, X.columns)
    if col_not_in_X.size > 0:
        raise ValueError(
            "GroupBy key columns {} not present in input dataframe X.".format(
                col_not_in_X
            )
        )
    if _is_spark_df(X):
        grouped_df = X.groupby(group_by_keys)
    elif _is_pandas_df(X):
        grouped_df = X.groupby(group_by_keys, sort=False)
    else:
        raise ValueError(
            "Only pandas and spark dataframes are supported by the GroupBy operator."
        )
    named_grouped_df = lale.datasets.data_schemas.add_table_name(
        grouped_df, lale.datasets.data_schemas.get_table_name(X)
    )
    return named_grouped_df
def transform(self, X):
    by = self.by
    orders: List[Tuple[str, bool]]
    if isinstance(by, list):
        orders = [self._get_order_key(k) for k in by]
    else:
        orders = [self._get_order_key(by)]
    cols: List[str] = [col for col, _ in orders]
    ascs: List[bool] = [asc for _, asc in orders]
    if _is_pandas_df(X):
        ordered_df = X.sort_values(by=cols, ascending=ascs)
    elif _is_spark_df(X):
        ordered_df = X.orderBy(cols, ascending=ascs)
    else:
        raise ValueError(
            "Only Pandas or Spark dataframes are supported as inputs. Please check that pyspark is installed if you see this error for a Spark dataframe."
        )
    ordered_df = forward_metadata(X, ordered_df)
    return ordered_df
def replace(df: Any, replace_expr: Expr, new_column_name: str):
    re: Any = replace_expr._expr
    column_name = re.args[0].attr
    if new_column_name is None:
        new_column_name = column_name
    mapping_dict = ast.literal_eval(re.args[1].value)
    if _is_pandas_df(df):
        new_column = df[column_name].replace(mapping_dict)
        df[new_column_name] = new_column
        if new_column_name != column_name:
            del df[column_name]
    elif spark_installed and _is_spark_df(df):
        # Build a Spark map literal from the replacement dictionary.
        mapping_expr = create_map([lit(x) for x in chain(*mapping_dict.items())])  # type: ignore
        df = df.withColumn(new_column_name, mapping_expr[df[column_name]])  # type: ignore
        if new_column_name != column_name:
            df = df.drop(column_name)
    else:
        raise ValueError(
            "function replace supports only Pandas dataframes or spark dataframes."
        )
    return new_column_name, df
def to_monoid(self, v) -> _MinMaxScalerMonoid:
    X, _ = v
    agg = {f"{c}_min": agg_min(it[c]) for c in get_columns(X)}
    agg.update({f"{c}_max": agg_max(it[c]) for c in get_columns(X)})
    aggregate = Aggregate(columns=agg)
    data_min_max = aggregate.transform(X)
    if _is_spark_df(X):
        data_min_max = data_min_max.toPandas()
    n = len(get_columns(X))
    data_min_ = np.zeros(shape=(n,))
    data_max_ = np.zeros(shape=(n,))
    for i, c in enumerate(get_columns(X)):
        data_min_[i] = data_min_max[f"{c}_min"]
        data_max_[i] = data_min_max[f"{c}_max"]
    n_samples_seen_ = count(X)
    feature_names_in_ = get_columns(X)
    return _MinMaxScalerMonoid(
        data_min_=data_min_,
        data_max_=data_max_,
        n_samples_seen_=n_samples_seen_,
        feature_names_in_=feature_names_in_,
    )
def __call__(self, X, y=None):
    if not _is_spark_df(X):
        logger.warning(
            f"SparkExplain called with non-Spark data of type {type(X)}"
        )
    else:
        X.explain(extended=self._extended, mode=self._mode)
def multitable_train_test_split(
    dataset,
    main_table_name,
    label_column_name,
    test_size=0.25,
    random_state=None,
) -> Tuple:
    """
    Splits a multi-table dataset into random train and test subsets by
    sampling rows of the main table; all other tables are passed through
    unchanged. Behaves similar to the `train_test_split`_ function from
    scikit-learn.

    .. _`train_test_split`: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

    Parameters
    ----------
    dataset : list of either Pandas or Spark dataframes
        Each dataframe in the list corresponds to an entity/table in the
        multi-table setting.

    main_table_name : string
        The name of the main table, as the split is going to be based on
        the main table.

    label_column_name : string
        The name of the label column from the main table.

    test_size : float or int, default=0.25
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the test split. If int,
        represents the absolute number of test samples.

    random_state : int, RandomState instance or None, default=None
        Controls the shuffling applied to the data before applying the
        split. Pass an integer for reproducible output across multiple
        function calls.

        - None
            RandomState used by numpy.random
        - numpy.random.RandomState
            Use the provided random state, only affecting other users of
            that same random state instance.
        - integer
            Explicit seed.

    Returns
    -------
    result : tuple
        - item 0: train_X, list of datasets corresponding to the train split
        - item 1: test_X, list of datasets corresponding to the test split
        - item 2: train_y
        - item 3: test_y
    """
    main_table_df = None
    index_of_main_table = -1
    for i, df in enumerate(dataset):
        if get_table_name(df) == main_table_name:
            main_table_df = df
            index_of_main_table = i
    if main_table_df is None:
        table_names = [get_table_name(df) for df in dataset]
        raise ValueError(
            f"Could not find {main_table_name} in the given dataset, the table names are {table_names}"
        )
    if _is_pandas_df(main_table_df):
        num_rows = len(main_table_df)
    elif _is_spark_df(main_table_df):
        num_rows = main_table_df.count()
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )
    if 0 < test_size < 1:
        num_test_rows = int(num_rows * test_size)
    else:
        num_test_rows = test_size
    # NOTE: random_state is accepted for API compatibility but is not
    # currently applied to the sampling below.
    test_indices = random.choice(range(num_rows), num_test_rows, replace=False)
    train_indices = list(set(range(num_rows)) - set(test_indices.tolist()))
    assert len(test_indices) + len(train_indices) == num_rows
    train_dataset = list(dataset)
    test_dataset = list(dataset)
    if _is_pandas_df(main_table_df):
        train_main_df = main_table_df.iloc[train_indices]
        test_main_df = main_table_df.iloc[test_indices]
        train_y = train_main_df[label_column_name]
        test_y = test_main_df[label_column_name]
    elif _is_spark_df(main_table_df):
        spark_session = SparkSession.builder.appName(
            "multitable_train_test_split"
        ).getOrCreate()
        train_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[train_indices]
        )
        test_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[test_indices]
        )
        train_y = train_main_df.select(label_column_name)
        test_y = test_main_df.select(label_column_name)
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )
    train_main_df = add_table_name(train_main_df, main_table_name)
    test_main_df = add_table_name(test_main_df, main_table_name)
    train_dataset[index_of_main_table] = train_main_df
    test_dataset[index_of_main_table] = test_main_df
    return train_dataset, test_dataset, train_y, test_y
def _df_count(X):
    if _is_pandas_df(X):
        return len(X)
    elif _is_spark_df(X):
        return X.count()
    return None  # other input kinds are not counted here
def filter(X):
    import operator  # stdlib; maps AST comparison nodes to comparison functions

    if isinstance(op, ast.Name):
        # currently only handles single argument predicates
        functions_module = importlib.import_module("lale.lib.lale.functions")
        func = getattr(functions_module, "filter_" + op.id)
        return func(X, lhs)
    # Both pandas columns and Spark columns overload the comparison
    # operators, so one table of operator functions covers both backends.
    comparison_ops = {
        ast.Eq: operator.eq,
        ast.NotEq: operator.ne,
        ast.GtE: operator.ge,
        ast.Gt: operator.gt,
        ast.LtE: operator.le,
        ast.Lt: operator.lt,
    }
    op_fn = comparison_ops.get(type(op))
    if op_fn is None:
        raise ValueError(
            "{} operator type found. Only ==, !=, >=, <=, >, < operators are supported".format(
                op
            )
        )
    assert lhs is not None
    assert rhs is not None
    # The right-hand side is either another column or a literal value.
    rhs_is_col = _is_ast_subs_or_attr(expr_to_parse.comparators[0])
    # Filtering spark dataframes
    if _is_spark_df(X):
        return X.filter(op_fn(col(lhs), col(rhs) if rhs_is_col else rhs))
    # Filtering pandas dataframes
    if _is_pandas_df(X):
        return X[op_fn(X[lhs], X[rhs] if rhs_is_col else rhs)]
    raise ValueError(
        "Only pandas and spark dataframes are supported by the filter operator."
    )
def transform(self, X):
    if all(_is_pandas(d) for d in X):
        name2series = {}
        for dataset in X:
            if _is_pandas_df(dataset):
                for name in dataset.columns:
                    name2series[name] = name2series.get(name, []) + [dataset[name]]
            elif _is_pandas_series(dataset):
                name = dataset.name
                name2series[name] = name2series.get(name, []) + [dataset]
            else:
                assert False
        duplicates = [name for name, ls in name2series.items() if len(ls) > 1]
        if len(duplicates) == 0:
            result = pd.concat(X, axis=1)
        else:
            logger.info(f"ConcatFeatures duplicate column names {duplicates}")
            deduplicated = [ls[-1] for _, ls in name2series.items()]
            result = pd.concat(deduplicated, axis=1)
    elif all(_is_spark_df(d) for d in X):

        def join(d1, d2):
            n1 = get_table_name(d1)
            n2 = get_table_name(d2)
            if n1 is None or n2 is None:
                raise ValueError(
                    "Table names are required to concatenate features of Spark dataframes"
                )
            index_col1 = get_index_name(d1)
            index_col2 = get_index_name(d2)
            if index_col1 is None or index_col2 is None:
                raise ValueError(
                    "Index columns are required to concatenate features of Spark dataframes"
                )
            transformer = Join(pred=[it[n1][index_col1] == it[n2][index_col2]])
            return transformer.transform([d1, d2])

        result = reduce(join, X)
    elif all(_is_pandas(d) or _is_spark_df(d) for d in X):
        # Mixed pandas/Spark inputs: convert the Spark dataframes to pandas
        # and retry.
        X = [d.toPandas() if _is_spark_df(d) else d for d in X]
        return self.transform(X)
    else:
        np_datasets = []
        # Preprocess the datasets to convert them to 2-d numpy arrays
        for dataset in X:
            if _is_pandas(dataset):
                np_dataset = dataset.values
            elif _is_spark_df(dataset):
                np_dataset = dataset.toPandas().values
            elif isinstance(dataset, scipy.sparse.csr_matrix):
                np_dataset = dataset.toarray()
            elif torch_installed and isinstance(dataset, torch.Tensor):
                np_dataset = dataset.detach().cpu().numpy()
            else:
                np_dataset = dataset
            if hasattr(np_dataset, "shape"):
                if len(np_dataset.shape) == 1:  # To handle numpy column vectors
                    np_dataset = np.reshape(np_dataset, (np_dataset.shape[0], 1))
            np_datasets.append(np_dataset)
        result = np.concatenate(np_datasets, axis=1)
    return result