import warnings

import numpy as np

# Helper utilities such as _is_pandas_df, _is_spark_df, _is_spark_with_index,
# get_index_name, get_columns, get_table_name, add_table_name, the Join
# operator, the `it` expression builder, and SparkDataFrameWithIndex are
# assumed to be provided by the surrounding library.


def _is_numeric_df(X):
    if _is_pandas_df(X):
        # All columns are numeric iff selecting numeric dtypes keeps them all.
        return X.shape[1] == X.select_dtypes(include=np.number).shape[1]
    elif _is_spark_df(X):
        from pyspark.sql.types import NumericType

        numeric_cols = [
            f.name for f in X.schema.fields if isinstance(f.dataType, NumericType)
        ]
        # The implicit index column does not count as a feature column.
        if _is_spark_with_index(X) and get_index_name(X) in numeric_cols:
            numeric_cols.remove(get_index_name(X))
        return len(get_columns(X)) == len(numeric_cols)
    else:
        return False
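

# A minimal, runnable sketch of _is_numeric_df on pandas inputs; the Spark
# branch needs a live SparkSession and is not exercised here. The _demo_*
# function name is hypothetical, added only for illustration.
def _demo_is_numeric_df():
    import pandas as pd

    assert _is_numeric_df(pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}))
    assert not _is_numeric_df(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}))
    assert not _is_numeric_df([[1, 2], [3, 4]])  # not a dataframe at all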


def join(d1, d2):
    n1 = get_table_name(d1)
    n2 = get_table_name(d2)
    if n1 is None or n2 is None:
        raise ValueError(
            "Table names are required to concatenate features of Spark dataframes"
        )
    index_col1 = get_index_name(d1)
    index_col2 = get_index_name(d2)
    if index_col1 is None or index_col2 is None:
        raise ValueError(
            "Index columns are required to concatenate features of Spark dataframes"
        )
    # Join the two named tables on matching index values.
    transformer = Join(pred=[it[n1][index_col1] == it[n2][index_col2]])
    return transformer.transform([d1, d2])
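

# A hedged usage sketch for join: both inputs need a table name and an index
# column. It assumes a pyspark installation and that add_table_name composes
# with SparkDataFrameWithIndex as the helpers elsewhere in this module
# suggest; the _demo_* name is hypothetical.
def _demo_join():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df1 = spark.createDataFrame([(0, 1.0), (1, 2.0)], ["id", "x"])
    df2 = spark.createDataFrame([(0, 10.0), (1, 20.0)], ["id", "y"])
    d1 = add_table_name(SparkDataFrameWithIndex(df1, index_name="id"), "t1")
    d2 = add_table_name(SparkDataFrameWithIndex(df2, index_name="id"), "t2")
    # Rows are matched on equal index values; the result carries both x and y.
    return join(d1, d2)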


def add_index(left_key_col, left_df, right_key_col, right_df, joined_df):
    """Re-attach index metadata to joined_df when the join keys include
    the index column(s) of the input dataframes."""
    if _is_spark_with_index(left_df) and _is_spark_with_index(right_df):
        left_name = get_index_name(left_df)
        right_name = get_index_name(right_df)
        if left_name in left_key_col and right_name in right_key_col:
            if left_name != right_name:
                # Both indexes are join keys but their names differ; keep the
                # left index and leave the right one as a regular data column.
                warnings.warn(f"New data column {right_name}")
            joined_df = SparkDataFrameWithIndex(joined_df, index_name=left_name)
        elif left_name in left_key_col:
            joined_df = SparkDataFrameWithIndex(joined_df, index_name=left_name)
        elif right_name in right_key_col:
            joined_df = SparkDataFrameWithIndex(joined_df, index_name=right_name)
        else:
            # Neither index participates in the join; the result has no index.
            pass
    elif _is_spark_with_index(left_df):
        index_name = get_index_name(left_df)
        if index_name in left_key_col:
            joined_df = SparkDataFrameWithIndex(joined_df, index_name=index_name)
    elif _is_spark_with_index(right_df):
        index_name = get_index_name(right_df)
        if index_name in right_key_col:
            joined_df = SparkDataFrameWithIndex(joined_df, index_name=index_name)
    else:
        # Callers must pass at least one indexed Spark dataframe.
        assert False
    return joined_df
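

# A hedged sketch of add_index: a raw Spark join returns a plain dataframe,
# and add_index re-attaches the index wrapper when the join keys include an
# index column. Assumes pyspark and that SparkDataFrameWithIndex behaves as
# a Spark dataframe; the _demo_* name is hypothetical.
def _demo_add_index():
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    left = SparkDataFrameWithIndex(
        spark.createDataFrame([(0, 1.0)], ["id", "x"]), index_name="id"
    )
    right = SparkDataFrameWithIndex(
        spark.createDataFrame([(0, 2.0)], ["id", "y"]), index_name="id"
    )
    raw = left.join(right, "id")
    # Both index names appear among the join keys, so the result keeps "id".
    return add_index(["id"], left, ["id"], right, raw)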


def transform(self, X):
    group_by_keys = []
    for by_element in self.by if self.by is not None else []:
        expr_to_parse = by_element._expr
        group_by_keys.append(self._get_group_key(expr_to_parse))
    col_not_in_X = np.setdiff1d(group_by_keys, get_columns(X))
    if col_not_in_X.size > 0:
        raise ValueError(
            f"GroupBy key columns {col_not_in_X} not present in input dataframe X."
        )
    if _is_spark_with_index(X):
        # Drop the implicit index column but preserve the table name.
        name = get_table_name(X)
        X = add_table_name(X.drop(get_index_name(X)), name)
    if _is_spark_df(X):
        grouped_df = X.groupby(group_by_keys)
    elif _is_pandas_df(X):
        grouped_df = X.groupby(group_by_keys, sort=False)
    else:
        raise ValueError(
            "Only pandas and Spark dataframes are supported by the GroupBy operator."
        )
    named_grouped_df = add_table_name(grouped_df, get_table_name(X))
    return named_grouped_df
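

# A hedged usage sketch for the transform above: self.by holds expressions
# such as it.col, and _get_group_key recovers the key column name from each
# expression. The GroupBy operator's constructor shape and the `it` builder's
# import path are assumptions, not shown in this module.
#
#   import pandas as pd
#   gb = GroupBy(by=[it.gender])
#   grouped = gb.transform(
#       pd.DataFrame({"gender": ["m", "f", "m"], "age": [1, 2, 3]})
#   )
#   # grouped is a pandas GroupBy keyed on "gender", tagged with X's table name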


def remove_implicit_col(key_col, df):
    if _is_spark_with_index(df):
        index = get_index_name(df)
        # Drop the implicit index column unless it is one of the join keys.
        if index not in key_col:
            df = df.drop(index)
    return df
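

# A hedged sketch of remove_implicit_col, reusing the hypothetical indexed
# frame built in _demo_add_index: the implicit index column survives only
# when the caller's key columns reference it.
#
#   remove_implicit_col(["id"], left)  # "id" is a key, so it is kept
#   remove_implicit_col(["x"], left)   # "id" is not a key, so it is dropped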