Code example #1
import numpy as np  # used for the pandas numeric-dtype check below


def _is_numeric_df(X):
    # True iff every data column of X has a numeric dtype.
    if _is_pandas_df(X):
        # Pandas: selecting only the numeric dtypes must keep every column.
        return X.shape[1] == X.select_dtypes(include=np.number).shape[1]
    elif _is_spark_df(X):
        from pyspark.sql.types import NumericType

        # Spark: collect the columns whose schema type is numeric.
        numeric_cols = [
            f.name for f in X.schema.fields
            if isinstance(f.dataType, NumericType)
        ]
        # The implicit index column does not count as a data column.
        if _is_spark_with_index(X) and get_index_name(X) in numeric_cols:
            numeric_cols.remove(get_index_name(X))
        return len(get_columns(X)) == len(numeric_cols)
    else:
        return False
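
The helpers _is_pandas_df, _is_spark_df, _is_spark_with_index, get_index_name, and get_columns are defined elsewhere in lale. A minimal sketch of the pandas behavior, with made-up data:

import pandas as pd

all_num = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
mixed = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
assert _is_numeric_df(all_num)    # every column is numeric
assert not _is_numeric_df(mixed)  # the string column fails the check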
Code example #2
def join(d1, d2):
    # Both inputs must carry a table name, since the Join predicate
    # refers to the tables by name via the `it` expression language.
    n1 = get_table_name(d1)
    n2 = get_table_name(d2)
    if n1 is None or n2 is None:
        raise ValueError(
            "Table names are required to concatenate features of Spark dataframes"
        )
    # Likewise, both inputs must have an index column to join on.
    index_col1 = get_index_name(d1)
    index_col2 = get_index_name(d2)
    if index_col1 is None or index_col2 is None:
        raise ValueError(
            "Index columns are required to concatenate features of Spark dataframes"
        )
    # Equi-join the two tables on their index columns.
    transformer = Join(pred=[it[n1][index_col1] == it[n2][index_col2]])
    return transformer.transform([d1, d2])
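
A hypothetical usage sketch, assuming a running Spark session; SparkDataFrameWithIndex and add_table_name are the lale helpers seen in the other examples on this page, and every table, index, and column name here is made up:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
d1 = SparkDataFrameWithIndex(
    spark.createDataFrame([(0, 1.0), (1, 2.0)], ["idx", "x"]), index_name="idx")
d2 = SparkDataFrameWithIndex(
    spark.createDataFrame([(0, 3.0), (1, 4.0)], ["idx", "y"]), index_name="idx")
d1 = add_table_name(d1, "t1")
d2 = add_table_name(d2, "t2")
joined = join(d1, d2)  # one row per matching "idx" value, with columns x and y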
Code example #3
File: join.py  Project: hirzel/lale
def add_index(left_key_col, left_df, right_key_col, right_df, joined_df):
    # Re-attach index metadata to the raw joined dataframe, depending on
    # which side(s) of the join carried an index and whether it was a key.
    if _is_spark_with_index(left_df) and _is_spark_with_index(right_df):
        left_name = get_index_name(left_df)
        right_name = get_index_name(right_df)
        if left_name in left_key_col and right_name in right_key_col:
            # Both indexes are join keys; keep the left one as the new index.
            if left_name != right_name:
                warnings.warn(f"New data column {right_name}")
            joined_df = SparkDataFrameWithIndex(joined_df, index_name=left_name)
        elif left_name in left_key_col:
            joined_df = SparkDataFrameWithIndex(joined_df, index_name=left_name)
        elif right_name in right_key_col:
            joined_df = SparkDataFrameWithIndex(joined_df, index_name=right_name)
        # If neither index is a join key, return the joined dataframe as-is.
    elif _is_spark_with_index(left_df):
        index_name = get_index_name(left_df)
        if index_name in left_key_col:
            joined_df = SparkDataFrameWithIndex(joined_df, index_name=index_name)
    elif _is_spark_with_index(right_df):
        index_name = get_index_name(right_df)
        if index_name in right_key_col:
            joined_df = SparkDataFrameWithIndex(joined_df, index_name=index_name)
    else:
        # Callers only invoke this when at least one side has an index.
        assert False
    return joined_df
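
A raw Spark join returns a plain dataframe, so this helper decides which input's index metadata (if any) survives; the final assert records the caller's contract that at least one side is indexed. Continuing the hypothetical d1/d2 from the sketch above, and assuming the index wrapper delegates to the underlying pyspark DataFrame:

raw = d1.join(d2, d1["idx"] == d2["idx"]).drop(d2["idx"])
indexed = add_index(["idx"], d1, ["idx"], d2, raw)
assert get_index_name(indexed) == "idx"  # shared join key: left index survives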
Code example #4
File: group_by.py  Project: hirzel/lale
def transform(self, X):
    # Turn each `by` expression into a concrete column name.
    group_by_keys = []
    for by_element in self.by if self.by is not None else []:
        expr_to_parse = by_element._expr
        group_by_keys.append(self._get_group_key(expr_to_parse))
    # Validate that every grouping key exists in X.
    col_not_in_X = np.setdiff1d(group_by_keys, get_columns(X))
    if col_not_in_X.size > 0:
        raise ValueError(
            f"GroupBy key columns {col_not_in_X} not present in input dataframe X."
        )
    # Drop the implicit index column before grouping, keeping the table name.
    if _is_spark_with_index(X):
        name = get_table_name(X)
        X = add_table_name(X.drop(get_index_name(X)), name)
    if _is_spark_df(X):
        grouped_df = X.groupby(group_by_keys)
    elif _is_pandas_df(X):
        grouped_df = X.groupby(group_by_keys, sort=False)
    else:
        raise ValueError(
            "Only pandas and spark dataframes are supported by the GroupBy operator."
        )
    # Propagate the table name to the grouped result.
    named_grouped_df = add_table_name(grouped_df, get_table_name(X))
    return named_grouped_df
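
On the pandas path the operator boils down to a plain groupby call; a minimal sketch with made-up data:

import pandas as pd

df = pd.DataFrame({"store": ["a", "a", "b"], "sales": [1, 2, 3]})
# Same call as the pandas branch above; sort=False keeps first-seen key order.
grouped = df.groupby(["store"], sort=False)
print(grouped.size())  # group "a" has 2 rows, group "b" has 1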
Code example #5
File: join.py  Project: hirzel/lale
def remove_implicit_col(key_col, df):
    # Drop the implicit index column unless it is one of the join keys.
    if _is_spark_with_index(df):
        index = get_index_name(df)
        if index not in key_col:
            df = df.drop(index)
    return df
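
Reusing the hypothetical indexed d1 from the earlier sketch (index column "idx", data column "x"):

kept = remove_implicit_col(["idx"], d1)   # "idx" is a join key, so it stays
trimmed = remove_implicit_col(["x"], d1)  # "idx" is not a key, so it is dropped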