create_add_one_struct_udf( result_formatter=lambda v1, v2: [np.array(v1), np.array(v2)]), # list of np.array, create_add_one_struct_udf(result_formatter=lambda v1, v2: np.array( [np.array(v1), np.array(v2)])), # np.array of np.array, create_add_one_struct_udf( result_formatter=lambda v1, v2: pd.DataFrame({ 'col1': v1, 'col2': v2 })), # pd.DataFrame, ] @elementwise( input_type=[dt.double], output_type=dt.Struct(['double_col', 'col2'], [dt.double, dt.double]), ) def overwrite_struct_elementwise(v): assert isinstance(v, pd.Series) return v + 1, v + 2 @elementwise( input_type=[dt.double], output_type=dt.Struct(['double_col', 'col2', 'float_col'], [dt.double, dt.double, dt.double]), ) def multiple_overwrite_struct_elementwise(v): assert isinstance(v, pd.Series) return v + 1, v + 2, v + 3
def create_demean_struct_udf(result_formatter):
    """Wrap ``demean_struct`` as an analytic UDF with a struct output.

    ``result_formatter`` is forwarded, together with ``demean_struct``, to
    ``_format_struct_udf_return_type``; whatever that helper returns is the
    callable that gets decorated.  The UDF is declared with two double
    inputs and a struct output whose double fields are ``demean`` and
    ``demean_weight``.
    """
    input_types = [dt.double, dt.double]
    struct_type = dt.Struct(
        ['demean', 'demean_weight'],
        [dt.double, dt.double],
    )
    as_analytic = analytic(input_type=input_types, output_type=struct_type)
    formatted_fn = _format_struct_udf_return_type(
        demean_struct, result_formatter
    )
    return as_analytic(formatted_fn)
return s + 1


@analytic(input_type=[dt.double], output_type=dt.double)
def calc_zscore(s):
    """Analytic UDF: subtract the mean of ``s`` and divide by ``s.std()``."""
    return (s - s.mean()) / s.std()


@reduction(input_type=[dt.double], output_type=dt.double)
def calc_mean(s):
    """Reduction UDF: return ``s.mean()``."""
    return s.mean()


@elementwise(
    input_type=[dt.double],
    output_type=dt.Struct(['col1', 'col2'], [dt.double, dt.double]),
)
def add_one_struct(v):
    """Elementwise UDF returning ``(v + 1, v + 2)``.

    The declared output is a struct with double fields ``col1`` and ``col2``.
    """
    return v + 1, v + 2


@analytic(
    input_type=[dt.double, dt.double],
    output_type=dt.Struct(['demean', 'demean_weight'], [dt.double, dt.double]),
)
def demean_struct(v, w):
    """Analytic UDF returning ``v`` and ``w`` each with its own mean removed.

    The declared output is a struct with double fields ``demean`` and
    ``demean_weight``.
    """
    return v - v.mean(), w - w.mean()


@reduction(
    input_type=[dt.double, dt.double],
def test_struct_from_dict():
    """Check ``dt.Struct.from_dict`` against an explicitly built struct.

    The mapping mixes a string type name (``'int64'``) with a datatype
    object (``dt.float64``); the expected struct lists the fields in the
    same order as the mapping (``b`` then ``a``).
    """
    expected = dt.Struct(names=['b', 'a'], types=[dt.int64, dt.float64])
    field_mapping = {'b': 'int64', 'a': dt.float64}
    actual = dt.Struct.from_dict(field_mapping)
    assert actual == expected
def create_add_one_struct_udf(result_formatter):
    """Wrap ``add_one_struct`` as an elementwise UDF with a struct output.

    ``result_formatter`` is forwarded, together with ``add_one_struct``, to
    ``_format_struct_udf_return_type``; whatever that helper returns is the
    callable that gets decorated.  The UDF is declared with one double
    input and a struct output whose double fields are ``col1`` and ``col2``.
    """
    input_types = [dt.double]
    struct_type = dt.Struct(
        ['col1', 'col2'],
        [dt.double, dt.double],
    )
    as_elementwise = elementwise(
        input_type=input_types, output_type=struct_type
    )
    formatted_fn = _format_struct_udf_return_type(
        add_one_struct, result_formatter
    )
    return as_elementwise(formatted_fn)
def create_mean_struct_udf(result_formatter):
    """Wrap ``mean_struct`` as a reduction UDF with a struct output.

    ``result_formatter`` is forwarded, together with ``mean_struct``, to
    ``_format_struct_udf_return_type``; whatever that helper returns is the
    callable that gets decorated.  The UDF is declared with two double
    inputs and a struct output whose double fields are ``mean`` and
    ``mean_weight``.
    """
    input_types = [dt.double, dt.double]
    struct_type = dt.Struct(
        ['mean', 'mean_weight'],
        [dt.double, dt.double],
    )
    as_reduction = reduction(input_type=input_types, output_type=struct_type)
    formatted_fn = _format_struct_udf_return_type(
        mean_struct, result_formatter
    )
    return as_reduction(formatted_fn)
def spark_struct_dtype_to_ibis_dtype(spark_dtype_obj, nullable=True):
    """Build a ``dt.Struct`` from an object exposing ``names`` and ``fields``.

    Each entry of ``spark_dtype_obj.fields`` is converted with
    ``dt.dtype(field.dataType, nullable=field.nullable)``, so every member
    keeps its own nullability.  The ``nullable`` argument applies only to
    the resulting struct itself.
    """
    field_names = spark_dtype_obj.names
    member_types = []
    for field in spark_dtype_obj.fields:
        member_types.append(dt.dtype(field.dataType, nullable=field.nullable))
    return dt.Struct(field_names, member_types, nullable=nullable)