def test_reregistration(c):
    def f(x):
        return x**2

    # Registering the same function twice is fine
    c.register_function(f, "f", [("x", np.float64)], np.float64)
    c.register_function(f, "f", [("x", np.int64)], np.int64)

    def f(x):
        return x**3

    # Registering a different function under the same name is not ...
    with pytest.raises(ValueError):
        c.register_function(f, "f", [("x", np.float64)], np.float64)

    # ... unless we explicitly replace it
    c.register_function(f, "f", [("x", np.float64)], np.float64, replace=True)

    fagg = dd.Aggregation("f", lambda x: x.sum(), lambda x: x.sum())
    c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64)
    c.register_aggregation(fagg, "fagg", [("x", np.int64)], np.int64)

    fagg = dd.Aggregation("f", lambda x: x.mean(), lambda x: x.mean())
    with pytest.raises(ValueError):
        c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64)

    c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64, replace=True)

def q12(lineitem, orders):
    t1 = time.time()
    date1 = datetime.strptime("1994-01-01", '%Y-%m-%d')
    date2 = datetime.strptime("1995-01-01", '%Y-%m-%d')
    sel = (
        (lineitem.L_RECEIPTDATE < date2)
        & (lineitem.L_COMMITDATE < date2)
        & (lineitem.L_SHIPDATE < date2)
        & (lineitem.L_SHIPDATE < lineitem.L_COMMITDATE)
        & (lineitem.L_COMMITDATE < lineitem.L_RECEIPTDATE)
        & (lineitem.L_RECEIPTDATE >= date1)
        & ((lineitem.L_SHIPMODE == "MAIL") | (lineitem.L_SHIPMODE == "SHIP"))
    )
    flineitem = lineitem[sel]
    jn = flineitem.merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY")
    gb = jn.groupby("L_SHIPMODE")["O_ORDERPRIORITY"]

    def g1(x):
        return x.apply(lambda s: ((s == "1-URGENT") | (s == "2-HIGH")).sum())

    def g2(x):
        return x.apply(lambda s: ((s != "1-URGENT") & (s != "2-HIGH")).sum())

    # custom groupby aggregations come from dask.dataframe, hence dd.Aggregation
    g1_agg = dd.Aggregation('g1', g1, lambda s0: s0.sum())
    g2_agg = dd.Aggregation('g2', g2, lambda s0: s0.sum())
    total = gb.agg([g1_agg, g2_agg])
    total = total.compute().reset_index().sort_values("L_SHIPMODE")
    print(total)
    print("Q12 Execution time (s): ", time.time() - t1)

def group_data(df):
    """Aggregate the DataFrame and return the grouped DataFrame.

    :param df: DataFrame
    :returns: DataFrame
    """
    # round timestamps down to an hour
    df['ts'] = df['ts'].dt.floor('1H')

    # group on customer, timestamp (rounded) and url
    gb = df.groupby(['customer', 'url', 'ts'])

    counter = dd.Aggregation(
        'counter',
        lambda s: s.apply(counter_chunk),
        lambda s: s.apply(counter_agg),
    )

    count_unique = dd.Aggregation(
        'count_unique',
        lambda s: s.apply(nunique_chunk),
        lambda s: s.apply(nunique_agg),
    )

    ag = gb.agg({'session_id': [count_unique, 'count'], 'referrer': counter})
    ag = ag.reset_index()

    # get rid of multilevel columns
    ag.columns = ['customer', 'url', 'ts', 'visitors', 'page_views', 'referrers']
    ag = ag.repartition(npartitions=df.npartitions)

    return ag

def get_function(self, agg_type=Library.PANDAS):
    if agg_type == Library.DASK:
        def chunk(s):
            def format_chunk(x):
                return x[:].fillna(0)

            chunk_sum = s.agg(lambda x: format_chunk(x).sum())
            chunk_len = s.agg(lambda x: len(format_chunk(x)))
            if chunk_sum.dtype == 'bool':
                chunk_sum = chunk_sum.astype('int64')
            if chunk_len.dtype == 'bool':
                chunk_len = chunk_len.astype('int64')
            return (chunk_sum, chunk_len)

        def agg(val, length):
            return (val.sum(), length.sum())

        def finalize(total, length):
            return total / length

        return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)

    def percent_true(s):
        return s.fillna(0).mean()

    return percent_true

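# A minimal usage sketch of the Dask branch above, assuming a hypothetical
# "percent true" primitive applied to a toy frame (the names and data below are
# not from the snippet). The chunk step emits (sum, count) per group, `agg` adds
# them up across partitions, and `finalize` divides once, i.e. a NaN-as-False mean.
import dask.dataframe as dd
import pandas as pd

percent_true = dd.Aggregation(
    "percent_true",
    chunk=lambda s: (
        s.agg(lambda x: x.fillna(0).sum()),
        s.agg(lambda x: len(x.fillna(0))),
    ),
    agg=lambda val, length: (val.sum(), length.sum()),
    finalize=lambda total, length: total / length,
)
ddf = dd.from_pandas(
    pd.DataFrame({"g": [0, 0, 1, 1], "flag": [True, False, True, None]}),
    npartitions=2,
)
print(ddf.groupby("g")["flag"].agg(percent_true).compute())  # 0.5 for both groups
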
def test_function(c):
    c.sql("CREATE SCHEMA other")
    c.sql("USE SCHEMA root")

    def f(x):
        return x**2

    c.register_function(f, "f", [("x", np.float64)], np.float64, schema_name="other")

    with pytest.raises(ParsingException):
        c.sql("SELECT F(a) AS a FROM df")

    c.sql("SELECT other.F(a) AS a FROM df")
    c.sql("USE SCHEMA other")
    c.sql("SELECT F(a) AS a FROM root.df")
    c.sql("USE SCHEMA root")

    fagg = dd.Aggregation("f", lambda x: x.sum(), lambda x: x.sum())
    c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64, schema_name="other")

    with pytest.raises(ParsingException):
        c.sql("SELECT FAGG(b) AS test FROM df")

    c.sql("SELECT other.FAGG(b) AS test FROM df")
    c.sql("USE SCHEMA other")
    c.sql("SELECT FAGG(b) AS test FROM root.df")

def test_groupby_agg_custom__mode():
    # A mode function that passes intermediates around as pure Python objects.
    # To protect the results from pandas inside apply, return them as single-item lists.
    def agg_mode(s):
        def impl(s):
            res, = s.iloc[0]
            for i, in s.iloc[1:]:
                res = res.add(i, fill_value=0)
            return [res]

        return s.apply(impl)

    agg_func = dd.Aggregation(
        'custom_mode',
        lambda s: s.apply(lambda s: [s.value_counts()]),
        agg_mode,
        lambda s: s.map(lambda i: i[0].argmax()),
    )

    d = pd.DataFrame({
        'g0': [0, 0, 0, 1, 1] * 3,
        'g1': [0, 0, 0, 1, 1] * 3,
        'cc': [4, 5, 4, 6, 6] * 3,
    })
    a = dd.from_pandas(d, npartitions=5)

    actual = a['cc'].groupby([a['g0'], a['g1']]).agg(agg_func)

    # cheat to get the correct index
    expected = pd.DataFrame({'g0': [0, 1], 'g1': [0, 1], 'cc': [4, 6]})
    expected = expected['cc'].groupby([expected['g0'], expected['g1']]).agg('sum')

    assert_eq(actual, expected)

def dask_agg_largest():
    return dd.Aggregation(
        name='largest',
        chunk=lambda grouped: (grouped.max(), grouped.min()),
        agg=lambda chunk_max, chunk_min: (chunk_max.max(), chunk_min.min()),
        finalize=lambda M, m: np.sign(M + m) * abs(pd.concat([M, m], axis=1)).max(axis=1),
    )

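# A minimal usage sketch for dask_agg_largest() above (the toy frame and column
# names are assumptions). The aggregation keeps the running max and min per group
# and finally returns whichever has the larger absolute value, with its sign.
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(
    pd.DataFrame({"g": [0, 0, 1, 1], "x": [3, -7, 2, 5]}),
    npartitions=2,
)
print(ddf.groupby("g")["x"].agg(dask_agg_largest()).compute())
# group 0 -> -7 (|-7| > |3|), group 1 -> 5
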
def get_function(self, agg_type=Library.PANDAS):
    if agg_type == Library.DASK:
        def chunk(s):
            def inner_chunk(x):
                x = x[:].dropna()
                return set(x.unique())

            return s.agg(inner_chunk)

        def agg(s):
            def inner_agg(x):
                x = x[:].dropna()
                return set().union(*x.values)

            return s.agg(inner_agg)

        def finalize(s):
            return s.apply(lambda x: len(x))

        return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)

    elif agg_type == Library.KOALAS:
        return 'nunique'

    return pd.Series.nunique

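# A standalone sketch of the set-based nunique pattern above (the toy frame and
# column names are assumptions; returning sets from groupby.agg may behave
# differently across pandas versions). Each chunk emits the set of distinct
# values per group, `agg` unions the sets across partitions, and `finalize`
# counts the members of the final set.
import dask.dataframe as dd
import pandas as pd

dask_nunique = dd.Aggregation(
    "nunique",
    chunk=lambda s: s.agg(lambda x: set(x.dropna().unique())),
    agg=lambda s: s.agg(lambda x: set().union(*x.values)),
    finalize=lambda s: s.apply(len),
)
ddf = dd.from_pandas(
    pd.DataFrame({"g": [0, 0, 1, 1], "v": ["a", "b", "a", "a"]}),
    npartitions=2,
)
print(ddf.groupby("g")["v"].agg(dask_nunique).compute())  # group 0 -> 2, group 1 -> 1
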
def get_dask_aggregation(self):
    def chunk(s):
        return s.agg(np.all)

    def agg(s):
        return s.agg(np.all)

    return dd.Aggregation(self.name, chunk=chunk, agg=agg)

def test_groupby_agg_custom__name_clash_with_internal_same_column():
    """for a single input column only unique names are allowed"""
    d = pd.DataFrame({'g': [0, 0, 1] * 3, 'b': [1, 2, 3] * 3})
    a = dd.from_pandas(d, npartitions=2)

    agg_func = dd.Aggregation('sum', lambda s: s.sum(), lambda s0: s0.sum())

    with pytest.raises(ValueError):
        a.groupby('g').aggregate({'b': [agg_func, 'sum']})

def test_aggregate_function(c):
    fagg = dd.Aggregation("f", lambda x: x.sum(), lambda x: x.sum())
    c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64)

    return_df = c.sql(
        """
        SELECT FAGG(b) AS test, SUM(b) AS "S"
        FROM df
        """
    )

    assert_eq(return_df["test"], return_df["S"], check_names=False)

def get_dask_aggregation(self):
    def chunk(s):
        return s.sum()

    def agg(s):
        return s.sum()

    def finalize(s):
        return s * self.n

    return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)

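# A standalone sketch of the same pattern (the name `scaled_sum`, the factor `n`,
# and the toy data are assumptions). `finalize` runs once on the fully combined
# result, so the scaling is applied exactly once per group, not once per chunk.
import dask.dataframe as dd
import pandas as pd

n = 3
scaled_sum = dd.Aggregation(
    "scaled_sum",
    chunk=lambda s: s.sum(),
    agg=lambda s: s.sum(),
    finalize=lambda s: s * n,
)
ddf = dd.from_pandas(pd.DataFrame({"g": [0, 0, 1], "x": [1, 2, 3]}), npartitions=2)
print(ddf.groupby("g")["x"].agg(scaled_sum).compute())  # group 0 -> 9, group 1 -> 9
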
def get_dask_aggregation(self):
    def chunk(s):
        chunk_sum = s.agg(np.sum)
        if chunk_sum.dtype == 'bool':
            chunk_sum = chunk_sum.astype('int64')
        return chunk_sum

    def agg(s):
        return s.agg(np.sum)

    return dd.Aggregation(self.name, chunk=chunk, agg=agg)

def test_aggregate_function(c):
    fagg = dd.Aggregation("f", lambda x: x.sum(), lambda x: x.sum())
    c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64)

    return_df = c.sql(
        """
        SELECT fagg(b) AS test, sum(b) AS "S"
        FROM df
        """
    )
    return_df = return_df.compute()

    assert (return_df["test"] == return_df["S"]).all()

def get_function(self, agg_type=Library.PANDAS):
    if agg_type == Library.DASK:
        def chunk(s):
            return s.agg(np.all)

        def agg(s):
            return s.agg(np.all)

        return dd.Aggregation(self.name, chunk=chunk, agg=agg)

    return np.all

def execute_group_concat_series_gb(op, data, sep, _, aggcontext=None, **kwargs):
    custom_group_concat = dd.Aggregation(
        name='custom_group_concat',
        chunk=lambda s: s.apply(list),
        agg=lambda s0: s0.apply(
            lambda chunks: sep.join(
                str(s) for s in itertools.chain.from_iterable(chunks)
            )
        ),
    )
    return data.agg(custom_group_concat)

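# A minimal usage sketch of the same group-concat pattern outside of ibis (the
# separator, toy frame, and column names are assumptions). The chunk step turns
# every partition's group into a Python list; the agg step chains those lists
# and joins the values into a single string.
import itertools
import dask.dataframe as dd
import pandas as pd

sep = ","
group_concat = dd.Aggregation(
    name="group_concat",
    chunk=lambda s: s.apply(list),
    agg=lambda s0: s0.apply(
        lambda chunks: sep.join(str(v) for v in itertools.chain.from_iterable(chunks))
    ),
)
ddf = dd.from_pandas(
    pd.DataFrame({"g": [0, 0, 1], "v": ["a", "b", "c"]}), npartitions=2
)
print(ddf.groupby("g")["v"].agg(group_concat).compute())  # e.g. group 0 -> "a,b"
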
def get_function(self, agg_type=Library.PANDAS):
    if agg_type == Library.DASK:
        def chunk(s):
            chunk_sum = s.agg(np.sum)
            if chunk_sum.dtype == 'bool':
                chunk_sum = chunk_sum.astype('int64')
            return chunk_sum

        def agg(s):
            return s.agg(np.sum)

        return dd.Aggregation(self.name, chunk=chunk, agg=agg)

    return np.sum

def test_groupby_agg_custom__name_clash_with_internal_different_column():
    """custom aggregation functions can share the name of a builtin function"""
    d = pd.DataFrame({'g': [0, 0, 1] * 3, 'b': [1, 2, 3] * 3, 'c': [4, 5, 6] * 3})
    a = dd.from_pandas(d, npartitions=2)

    # NOTE: this function is purposefully misnamed
    agg_func = dd.Aggregation(
        'sum',
        lambda s: (s.count(), s.sum()),
        lambda s0, s1: (s0.sum(), s1.sum()),
        lambda s0, s1: s1 / s0,
    )

    # NOTE: the name of agg-func is suppressed in the output,
    # since only a single agg func per column was specified
    result = a.groupby('g').aggregate({'b': agg_func, 'c': 'sum'})
    expected = d.groupby('g').aggregate({'b': 'mean', 'c': 'sum'})

    assert_eq(result, expected, check_dtype=False)

def get_dask_aggregation(self):
    def chunk(s):
        def inner_chunk(x):
            x = x[:].dropna()
            return set(x.unique())

        return s.agg(inner_chunk)

    def agg(s):
        def inner_agg(x):
            x = x[:].dropna()
            return set().union(*x.values)

        return s.agg(inner_agg)

    def finalize(s):
        return s.apply(lambda x: len(x))

    return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)

def get_dask_aggregation(self):
    def chunk(s):
        def format_chunk(x):
            return x[:].fillna(0)

        chunk_sum = s.agg(lambda x: format_chunk(x).sum())
        chunk_len = s.agg(lambda x: len(format_chunk(x)))
        if chunk_sum.dtype == 'bool':
            chunk_sum = chunk_sum.astype('int64')
        if chunk_len.dtype == 'bool':
            chunk_len = chunk_len.astype('int64')
        return (chunk_sum, chunk_len)

    def agg(val, length):
        return (val.sum(), length.sum())

    def finalize(total, length):
        return total / length

    return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)

def make_datasets(in_csv, out_dir):
    """Processes a csv file and saves a curated dataset to disk.

    Parameters
    ----------
    in_csv: str
        path to the csv file on local disk
    out_dir:
        directory where files should be saved to

    Returns
    -------
    None
    """
    log = logging.getLogger('make-dataset')

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Connect to the dask cluster
    log.info(
        f'Starting make_datasets with in_csv: {in_csv} and out_dir: {out_dir}')
    log.info('Connecting to cluster')
    c = Client('dask-scheduler:8786')

    # load data as a dask DataFrame; if you have trouble with dask,
    # please fall back to pandas or numpy
    log.info('Reading csv file')
    ddf = dd.read_csv(in_csv, blocksize=1e6)
    log.info('output dataframe head')
    log.info(ddf.head())
    log.info('Trace 1')

    # we set the index so we can properly execute loc below
    ddf = ddf.set_index('Unnamed: 0')

    # trigger computation
    n_samples = len(ddf)

    # Fill NaN values with a new 'Unknown' category
    ddf['country'] = ddf['country'].fillna('Unknown')
    ddf['province'] = ddf['province'].fillna('Unknown')
    ddf['taster_name'] = ddf['taster_name'].fillna('Unknown')
    log.info('Trace 2')

    # Fill region_1 missing values using the 'province' column.
    # The most common value for each province will be used; the rest are labeled Unknown
    mode = dd.Aggregation('mode', chunk, agg, finalize)
    most_common_region = ddf.groupby(['province']).agg({'region_1': mode}).compute()
    ddf['region_1'] = ddf.apply(
        lambda x: most_common_region.loc[x.province, 'region_1']
        if x.province in most_common_region['region_1'].index else 'Unknown',
        axis=1).where(ddf['region_1'].isna(), ddf['region_1'])
    log.info('Trace 3')

    # We fill price values with the province's average price. If that is
    # not available, we use the global average price
    mean_prices = ddf.groupby(['province'])['price'].mean().compute()
    global_mean = ddf['price'].mean().compute()
    mean_prices = mean_prices.fillna(global_mean)
    ddf['price'] = ddf.apply(lambda x: mean_prices[x['province']],
                             axis=1,
                             meta=('x', 'f8')).where(ddf['price'].isna(), ddf['price'])

    # Drop these columns as explained in the notebook
    ddf = ddf.drop([
        'description', 'designation', 'region_2', 'taster_twitter_handle', 'title'
    ], axis=1)

    # Encode categorical values using one-hot encoding.
    # This results in >6k columns. Maybe we'll need to change the encoding type
    # for some features such as 'winery' with so many unique values.
    # Also, I think this should be done in the model task.
    ddf = ddf.categorize()
    # encoder = DummyEncoder()
    # ddf = encoder.fit_transform(ddf)

    # # Normalize price values
    # scaler = StandardScaler()
    # ddf['price'] = scaler.fit_transform(ddf[['price']]).price

    log.info('dataset processed')

    # split dataset into train and test; feel free to adjust the test percentage
    idx = np.arange(n_samples)
    test_idx = idx[:n_samples // 10]
    test = ddf.loc[test_idx]

    train_idx = idx[n_samples // 10:]
    train = ddf.loc[train_idx]

    # This also shuffles the data. Not sure if the csv was shuffled before..
    # train, test = ddf.random_split([0.9, 0.1], shuffle=True)

    _save_datasets(train, test, out_dir)

)
from ibis.backends.pandas.execution.arrays import (
    execute_array_index,
    execute_array_length,
)

DASK_DISPATCH_TYPES: TypeRegistrationDict = {
    ops.ArrayLength: [((dd.Series,), execute_array_length)],
    ops.ArrayIndex: [((dd.Series, int), execute_array_index)],
}
register_types_to_dispatcher(execute_node, DASK_DISPATCH_TYPES)

collect_list = dd.Aggregation(
    name="collect_list",
    chunk=lambda s: s.apply(list),
    agg=lambda s0: s0.apply(
        lambda chunks: list(itertools.chain.from_iterable(chunks))
    ),
)


@execute_node.register(ops.ArrayColumn, list)
def execute_array_column(op, cols, **kwargs):
    df = dd.concat(cols, axis=1)
    return df.apply(
        lambda row: np.array(row, dtype=object), axis=1, meta=(None, 'object')
    )


# TODO - aggregations - #2553
@execute_node.register(ops.ArrayCollect, dd.Series)
def execute_array_collect(op, data, aggcontext=None, **kwargs):

def dask_agg_absmax():
    return dd.Aggregation(
        name='absmax',
        chunk=lambda grouped: abs(grouped.max()),
        agg=lambda chunk_max: abs(chunk_max.max()),
    )

# Compute on dask DataFrame without divisions (requires shuffling)
result = ddf_no_divs.groupby(group_args).apply(apply_func)
assert_eq(expected, result, check_divisions=False)

# Check that divisions were preserved (all None in this case)
assert ddf_no_divs.divisions == result.divisions

# Crude check to see if shuffling was performed.
# The groupby operation should add more than 1 task per partition
assert len(result.dask) > (len(ddf_no_divs.dask) + ddf_no_divs.npartitions)


custom_mean = dd.Aggregation(
    'mean',
    lambda s: (s.count(), s.sum()),
    lambda s0, s1: (s0.sum(), s1.sum()),
    lambda s0, s1: s1 / s0,
)

custom_sum = dd.Aggregation('sum', lambda s: s.sum(), lambda s0: s0.sum())


@pytest.mark.parametrize('pandas_spec, dask_spec, check_dtype', [
    ({'b': 'mean'}, {'b': custom_mean}, False),
    ({'b': 'sum'}, {'b': custom_sum}, True),
    (['mean', 'sum'], [custom_mean, custom_sum], False),
    ({'b': ['mean', 'sum']}, {'b': [custom_mean, custom_sum]}, False),
])
def test_dataframe_groupby_agg_custom_sum(pandas_spec, dask_spec, check_dtype):
    df = pd.DataFrame({'g': [0, 0, 1] * 3, 'b': [1, 2, 3] * 3})
    ddf = dd.from_pandas(df, npartitions=2)

def process_social(file):
    # Create the output file; one for every day
    cbgs_out = output_folder + file.split('/')[-1]
    if os.path.exists(cbgs_out):
        return None

    print('loading: ' + file)

    # Load in Dask DF
    # dtype={'distance_traveled_from_home': 'float64', }
    social_df = dd.read_csv(file, error_bad_lines=False, dtype=start_dtype)
    # social_df = dd.from_pandas(pd.read_csv(file, nrows=10), npartitions=1)
    # social_df = social_df.fillna(method='ffill')

    # Create date and origin_fips cols
    social_df['date_start'] = social_df['date_range_start'].map(lambda x: x[:10])
    social_df['date_end'] = social_df['date_range_end'].map(lambda x: x[:10])
    social_df['origin_fips'] = social_df['origin_census_block_group'].apply(
        lambda x: cbgs_to_county_fips(x),
        meta=('origin_census_block_group', str))

    # Groupby and Sum
    agg_dict = {x: ['min', 'max', 'sum', 'prod', 'mean', 'std'] for x in agg_cols}
    bucket_agg = dd.Aggregation(
        'join',
        lambda x: x.agg(''.join),
        lambda x0: x0.agg(''.join),
    )
    bucket_agg2 = dd.Aggregation(
        'new',
        lambda x: x.agg(lambda x: dict_flatten2(x, num_fips)),
        lambda x0: x0.agg(''.join),
    )
    bucket_dict = {x: bucket_agg for x in bucket_cols + home_cols + destination_cols}

    for col in bucket_cols:
        social_df[col] = social_df[col].astype(str)

    social_df = social_df[groupby_cols + bucket_cols + agg_cols + home_cols + destination_cols] \
        .groupby(groupby_cols) \
        .agg(dict(agg_dict, **bucket_dict))  # most efficient way to add two dicts
    # print(social_df.compute())

    # Kill the dreaded MultiIndex
    social_df.columns = [
        '_'.join(col).strip() for col in social_df.columns.values
    ]
    social_df = social_df.reset_index()

    # Redo MetaData
    for col in bucket_cols:
        # print(col)
        social_df[col + '_join'] = social_df[col + '_join'].astype(str)

    for bucket in bucket_cols:
        cols = bucket_col_dict[bucket]
        raw_cols = raw_bucket_col_dict[bucket]
        col = bucket + '_join'
        social_df[cols] = social_df.map_partitions(
            lambda x: x[col].apply(lambda z: dict_flatten3(z, raw_cols, cols)))
        social_df = social_df.drop(col, axis=1)

    social_df = social_df.compute()
    social_df[destination_fips_cols] = social_df['destination_cbgs_join'].apply(
        lambda z: cbgs_dict_flatten2(z, num_fips))

    print(social_df)

    print('uploading 10000000 lines of data')
    social_df.to_csv(cbgs_out
                     # , single_file=True
                     # chunksize=chunksize
                     )

class LogicalAggregatePlugin(BaseRelPlugin):
    """
    A LogicalAggregate is used in GROUP BY clauses, but also
    when aggregating a function over the full dataset.

    In the first case we need to find out which columns we need to
    group over, in the second case we "cheat" and add a 1-column
    to the dataframe, which allows us to reuse every aggregation
    function we already know of.

    The rest is just a lot of column-name-bookkeeping.
    Fortunately calcite will already make sure that each
    aggregation function will only ever be called with a single input
    column (by splitting the inner calculation to a step before).
    """

    class_name = "org.apache.calcite.rel.logical.LogicalAggregate"

    AGGREGATION_MAPPING = {
        "$sum0": "sum",
        "any_value": dd.Aggregation(
            "any_value",
            lambda s: s.sample(n=1).values,
            lambda s0: s0.sample(n=1).values,
        ),
        "avg": "mean",
        "bit_and": ReduceAggregation("bit_and", operator.and_),
        "bit_or": ReduceAggregation("bit_or", operator.or_),
        "bit_xor": ReduceAggregation("bit_xor", operator.xor),
        "count": "count",
        "every": dd.Aggregation("every", lambda s: s.all(), lambda s0: s0.all()),
        "max": "max",
        "min": "min",
        "single_value": "first",
    }

    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc,) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        # We make our life easier with having unique column names
        cc = cc.make_unique()

        # I have no idea what that is, but so far it was always of length 1
        assert len(rel.getGroupSets()) == 1, "Do not know how to handle this case!"

        # Extract the information, which columns we need to group for
        group_column_indices = [int(i) for i in rel.getGroupSet()]
        group_columns = [
            cc.get_backend_by_frontend_index(i) for i in group_column_indices
        ]

        # Always keep an additional column around for empty groups and aggregates
        additional_column_name = str(uuid.uuid4())

        # NOTE: it might be the case that
        # we do not need this additional
        # column, but hopefully adding a single
        # column of 1 is not so problematic...
        df = df.assign(**{additional_column_name: 1})
        cc = cc.add(additional_column_name)
        dc = DataContainer(df, cc)

        # Collect all aggregates
        filtered_aggregations, output_column_order = self._collect_aggregations(
            rel, dc, group_columns, additional_column_name, context)

        if not group_columns:
            # There was actually no GROUP BY specified in the SQL
            # Still, this plan can also be used if we need to aggregate something over the full
            # data sample
            # To reuse the code, we just create a new column at the end with a single value
            # It is important to do this after creating the aggregations,
            # as we do not want this additional column to be used anywhere
            group_columns = [additional_column_name]

            logger.debug("Performing full-table aggregation")

        # Now we can perform the aggregates
        # We iterate through all pairs of (possibly pre-filtered)
        # dataframes and the aggregations to perform on this data...
        df_agg = None
        for filtered_df_desc, aggregation in filtered_aggregations.items():
            filtered_column = filtered_df_desc.filtered_column
            if filtered_column:
                logger.debug(
                    f"Aggregating {dict(aggregation)} on the data filtered by {filtered_column}"
                )
            else:
                logger.debug(f"Aggregating {dict(aggregation)} on the data")

            # ... we perform the aggregations ...
            filtered_df = filtered_df_desc.df
            # TODO: we could use the type information for
            # pre-calculating the meta information
            filtered_df_agg = filtered_df.groupby(by=group_columns).agg(aggregation)

            # ... fix the column names to a single level ...
            filtered_df_agg.columns = filtered_df_agg.columns.get_level_values(-1)

            # ... and finally concat the new data with the already present columns
            if df_agg is None:
                df_agg = filtered_df_agg
            else:
                df_agg = df_agg.assign(**{
                    col: filtered_df_agg[col] for col in filtered_df_agg.columns
                })

        # SQL does not care about the index, but we do not want to have any multiindices
        df_agg = df_agg.reset_index(drop=True)

        # Fix the column names and the order of them, as this was messed with during the aggregations
        df_agg.columns = df_agg.columns.get_level_values(-1)
        cc = ColumnContainer(df_agg.columns).limit_to(output_column_order)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df_agg, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc

    def _collect_aggregations(
        self,
        rel: "org.apache.calcite.rel.RelNode",
        dc: DataContainer,
        group_columns: List[str],
        additional_column_name: str,
        context: "dask_sql.Context",
    ) -> Tuple[Dict[GroupDatasetDescription, AggregationDescription], List[int]]:
        """
        Create a mapping of dataframe -> aggregations (in the form input column,
        output column, aggregation) and the expected order of output columns.
        """
        aggregations = defaultdict(lambda: defaultdict(dict))
        output_column_order = []

        df = dc.df
        cc = dc.column_container

        # SQL needs to copy the old content also. As the values of the group columns
        # are the same for a single group anyways, we just use the first row
        for col in group_columns:
            aggregations[GroupDatasetDescription(df)][col][col] = "first"
            output_column_order.append(col)

        # Now collect all aggregations
        for agg_call in rel.getNamedAggCalls():
            output_col = str(agg_call.getValue())
            expr = agg_call.getKey()

            if expr.hasFilter():
                filter_column = cc.get_backend_by_frontend_index(expr.filterArg)
                filter_expression = df[filter_column]
                filtered_df = df[filter_expression]

                grouped_df = GroupDatasetDescription(filtered_df, filter_column)
            else:
                grouped_df = GroupDatasetDescription(df)

            if expr.isDistinct():
                raise NotImplementedError(
                    "DISTINCT is not implemented (yet)")  # pragma: no cover

            aggregation_name = str(expr.getAggregation().getName())
            aggregation_name = aggregation_name.lower()
            try:
                aggregation_function = self.AGGREGATION_MAPPING[aggregation_name]
            except KeyError:
                try:
                    aggregation_function = context.functions[aggregation_name].f
                except KeyError:  # pragma: no cover
                    raise NotImplementedError(
                        f"Aggregation function {aggregation_name} not implemented (yet)."
                    )

            inputs = expr.getArgList()
            if len(inputs) == 1:
                input_col = cc.get_backend_by_frontend_index(inputs[0])
            elif len(inputs) == 0:
                input_col = additional_column_name
            else:
                raise NotImplementedError(
                    "Can not cope with more than one input"
                )  # pragma: no cover

            aggregations[grouped_df][input_col][output_col] = aggregation_function
            output_column_order.append(output_col)

        return aggregations, output_column_order

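# A small illustration of the "cheat" described in the plugin docstring above
# (plain dask, not dask_sql internals; the helper column name is hypothetical):
# to aggregate over the full table with the same groupby code path, group by a
# constant helper column of ones.
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"b": [1.0, 2.0, 3.0]}), npartitions=2)
helper = "_group_all"  # hypothetical temporary column name
ddf = ddf.assign(**{helper: 1})
full_table_sum = ddf.groupby(helper).agg({"b": "sum"}).reset_index(drop=True)
print(full_table_sum.compute())  # a single row holding SUM(b) over all rows
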
def get_annotations(self,
                    index: str,
                    columns: list,
                    agg: str = "concat",
                    filter_values: pd.Series = None):
    """Returns the Database's DataFrame such that it's indexed by :param index:,
    then applies a groupby operation and aggregates all other columns by
    concatenating all unique values.

    Args:
        index (str): The column name of the DataFrame to join by.
        columns (list): a list of column names.
        agg (str): Function to aggregate when there is more than one value for each index instance.
            E.g. ['first', 'last', 'sum', 'mean', 'size', 'concat'], default 'concat'.
        filter_values (pd.Series): The values on the `index` column to filter before
            performing the groupby-agg operations.

    Returns:
        DataFrame: A dataframe to be used for annotation
    """
    if not set(columns).issubset(set(self.data.columns)):
        raise Exception(
            "The columns argument must be a list that is a subset of the columns in the dataframe.",
            "These columns don't exist in the database:",
            set(columns) - set(self.data.columns.tolist()))

    # Select the df columns. The `columns` list shouldn't contain the index column
    if index in columns:
        columns.pop(columns.index(index))

    df = self.data[columns + [index]]
    if filter_values is not None:
        df = df[df[index].isin(list(filter_values))]

    # if index != self.data.index.name and index in self.data.columns:
    #     df = df.set_index(index)

    # Groupby index
    groupby = df.groupby(index)

    # Aggregate all columns by concatenating unique values
    if agg == "concat":
        if isinstance(df, pd.DataFrame):
            aggregated = groupby.agg({col: concat_uniques for col in columns})

        elif isinstance(df, dd.DataFrame):
            collect_concat = dd.Aggregation(
                name='collect_concat',
                chunk=lambda s1: s1.apply(list),
                agg=lambda s2: s2.apply(lambda chunks: filter(
                    lambda x: False if x == "None" or x is None else True,
                    set(itertools.chain.from_iterable(chunks)))),
                finalize=lambda s3: s3.apply(lambda xx: '|'.join(xx)))

            aggregated = groupby.agg({col: collect_concat for col in columns})
        else:
            raise Exception("Unsupported dataframe: {}".format(df))

    # Any other aggregation functions
    else:
        aggregated = groupby.agg({col: agg for col in columns})

    # if aggregated.index.duplicated().sum() > 0:
    #     raise ValueError("DataFrame must not have duplicates in index")

    return aggregated

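# A standalone sketch of the `collect_concat` pattern used above (the toy frame
# and column names are assumptions; the real method builds this inside
# get_annotations). Chunks collect values as lists, `agg` chains and
# de-duplicates them while dropping None-like entries, and `finalize` joins the
# survivors with '|' (sorted here only to make the output deterministic).
import itertools
import dask.dataframe as dd
import pandas as pd

collect_concat = dd.Aggregation(
    name="collect_concat",
    chunk=lambda s1: s1.apply(list),
    agg=lambda s2: s2.apply(
        lambda chunks: [
            x for x in set(itertools.chain.from_iterable(chunks))
            if x is not None and x != "None"
        ]
    ),
    finalize=lambda s3: s3.apply(lambda xs: "|".join(sorted(xs))),
)
ddf = dd.from_pandas(
    pd.DataFrame({"gene": ["A", "A", "B"], "go_term": ["x", "y", "x"]}),
    npartitions=2,
)
print(ddf.groupby("gene")["go_term"].agg(collect_concat).compute())  # A -> "x|y", B -> "x"
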
cfuns = {"name": 'category', "progtype": 'category'}

y = dd.read_csv(okfile, sep=';', dtype=cfuns, parse_dates=['lastdate'])
y.head()
y.name.unique().compute()
y.user.unique().compute()

uni_len = dd.Aggregation(
    name='uni_len',
    chunk=lambda x: x.unique(),
    agg=lambda xa: len(xa),
)

a = y.groupby('name').agg({'lang': sum, 'times': sum, 'user': uni_len}).compute()

y['hour'] = y.lastdate.dt.hour
y.hour = y['hour'].cat.as_known()
y.name = y['name'].cat.as_known()

a = y.pivot_table(index='name', columns='hour', values='lang', aggfunc='sum').compute()
a.sort_values(by=['0'])
a.to_excel("E:/pivot_hour.xlsx")

class LogicalAggregatePlugin(BaseRelPlugin):
    """
    A LogicalAggregate is used in GROUP BY clauses, but also
    when aggregating a function over the full dataset.

    In the first case we need to find out which columns we need to
    group over, in the second case we "cheat" and add a 1-column
    to the dataframe, which allows us to reuse every aggregation
    function we already know of.
    As NULLs are not groupable in dask, we handle them specially
    by adding a temporary column which is True for all NULL values
    and False otherwise (and also group by it).

    The rest is just a lot of column-name-bookkeeping.
    Fortunately calcite will already make sure that each
    aggregation function will only ever be called with a single input
    column (by splitting the inner calculation to a step before).

    Open TODO: So far we are following the dask default
    to only have a single partition after the group by (which is usually
    a reasonable assumption). It would be nice to control
    these things via HINTs.
    """

    class_name = "org.apache.calcite.rel.logical.LogicalAggregate"

    AGGREGATION_MAPPING = {
        "$sum0": AggregationSpecification("sum", AggregationOnPandas("sum")),
        "any_value": AggregationSpecification(
            dd.Aggregation(
                "any_value",
                lambda s: s.sample(n=1).values,
                lambda s0: s0.sample(n=1).values,
            )
        ),
        "avg": AggregationSpecification("mean", AggregationOnPandas("mean")),
        "bit_and": AggregationSpecification(
            ReduceAggregation("bit_and", operator.and_)
        ),
        "bit_or": AggregationSpecification(ReduceAggregation("bit_or", operator.or_)),
        "bit_xor": AggregationSpecification(ReduceAggregation("bit_xor", operator.xor)),
        "count": AggregationSpecification("count"),
        "every": AggregationSpecification(
            dd.Aggregation("every", lambda s: s.all(), lambda s0: s0.all())
        ),
        "max": AggregationSpecification("max", AggregationOnPandas("max")),
        "min": AggregationSpecification("min", AggregationOnPandas("min")),
        "single_value": AggregationSpecification("first"),
    }

    def convert(
        self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
    ) -> DataContainer:
        (dc,) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        # We make our life easier with having unique column names
        cc = cc.make_unique()

        # I have no idea what that is, but so far it was always of length 1
        assert len(rel.getGroupSets()) == 1, "Do not know how to handle this case!"

        # Extract the information, which columns we need to group for
        group_column_indices = [int(i) for i in rel.getGroupSet()]
        group_columns = [
            cc.get_backend_by_frontend_index(i) for i in group_column_indices
        ]

        dc = DataContainer(df, cc)

        if not group_columns:
            # There was actually no GROUP BY specified in the SQL
            # Still, this plan can also be used if we need to aggregate something over the full
            # data sample
            # To reuse the code, we just create a new column at the end with a single value
            logger.debug("Performing full-table aggregation")

        # Do all aggregates
        df_result, output_column_order = self._do_aggregations(
            rel,
            dc,
            group_columns,
            context,
        )

        # SQL does not care about the index, but we do not want to have any multiindices
        df_agg = df_result.reset_index(drop=True)

        # Fix the column names and the order of them, as this was messed with during the aggregations
        df_agg.columns = df_agg.columns.get_level_values(-1)
        cc = ColumnContainer(df_agg.columns).limit_to(output_column_order)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df_agg, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc

    def _do_aggregations(
        self,
        rel: "org.apache.calcite.rel.RelNode",
        dc: DataContainer,
        group_columns: List[str],
        context: "dask_sql.Context",
    ) -> Tuple[dd.DataFrame, List[str]]:
        """
        Main functionality: return the result dataframe
        and the output column order
        """
        df = dc.df
        cc = dc.column_container

        # We might need it later.
        # If not, lets hope that adding a single column should not
        # be a huge problem...
        additional_column_name = new_temporary_column(df)
        df = df.assign(**{additional_column_name: 1})

        # Add an entry for every grouped column, as SQL wants them first
        output_column_order = group_columns.copy()

        # Collect all aggregations we need to do
        collected_aggregations, output_column_order = self._collect_aggregations(
            rel, df, cc, context, additional_column_name, output_column_order
        )

        if not collected_aggregations:
            return df[group_columns].drop_duplicates(), output_column_order

        # SQL needs to have a column with the grouped values as the first
        # output column.
        # As the values of the group columns
        # are the same for a single group anyways, we just use the first row
        for col in group_columns:
            collected_aggregations[None].append((col, col, "first"))

        # Now we can go ahead and use these grouped aggregations
        # to perform the actual aggregation
        # It is very important to start with the non-filtered entry.
        # Otherwise we might lose some entries in the grouped columns
        df_result = None
        key = None
        if key in collected_aggregations:
            aggregations = collected_aggregations.pop(key)
            df_result = self._perform_aggregation(
                df,
                None,
                aggregations,
                additional_column_name,
                group_columns,
            )

        # Now we can also do the rest
        for filter_column, aggregations in collected_aggregations.items():
            agg_result = self._perform_aggregation(
                df,
                filter_column,
                aggregations,
                additional_column_name,
                group_columns,
            )

            # ... and finally concat the new data with the already present columns
            if df_result is None:
                df_result = agg_result
            else:
                df_result = df_result.assign(
                    **{col: agg_result[col] for col in agg_result.columns}
                )

        return df_result, output_column_order

    def _collect_aggregations(
        self,
        rel: "org.apache.calcite.rel.RelNode",
        df: dd.DataFrame,
        cc: ColumnContainer,
        context: "dask_sql.Context",
        additional_column_name: str,
        output_column_order: List[str],
    ) -> Tuple[Dict[Tuple[str, str], List[Tuple[str, str, Any]]], List[str]]:
        """
        Collect all aggregations together which have the same filter column,
        so that the aggregations only need to be done once.

        Returns the aggregations as a mapping filter_column -> List of Aggregations,
        where the aggregations are in the form (input_col, output_col, aggregation function (or string))
        """
        collected_aggregations = defaultdict(list)

        for agg_call in rel.getNamedAggCalls():
            expr = agg_call.getKey()

            # Find out about the input column
            inputs = expr.getArgList()
            if len(inputs) == 1:
                input_col = cc.get_backend_by_frontend_index(inputs[0])
            elif len(inputs) == 0:
                input_col = additional_column_name
            else:
                raise NotImplementedError("Can not cope with more than one input")

            # Extract flags (filtering/distinct)
            if expr.isDistinct():  # pragma: no cover
                raise ValueError("Apache Calcite should optimize them away!")

            filter_column = None
            if expr.hasFilter():
                filter_column = cc.get_backend_by_frontend_index(expr.filterArg)

            # Find out which aggregation function to use
            aggregation_name = str(expr.getAggregation().getName())
            aggregation_name = aggregation_name.lower()
            try:
                aggregation_function = self.AGGREGATION_MAPPING[aggregation_name]
            except KeyError:
                try:
                    aggregation_function = context.functions[aggregation_name]
                except KeyError:  # pragma: no cover
                    raise NotImplementedError(
                        f"Aggregation function {aggregation_name} not implemented (yet)."
                    )
            if isinstance(aggregation_function, AggregationSpecification):
                dtype = df[input_col].dtype
                if pd.api.types.is_numeric_dtype(dtype):
                    aggregation_function = aggregation_function.numerical_aggregation
                else:
                    aggregation_function = (
                        aggregation_function.non_numerical_aggregation
                    )

            # Finally, extract the output column name
            output_col = str(agg_call.getValue())

            # Store the aggregation
            key = filter_column
            value = (input_col, output_col, aggregation_function)
            collected_aggregations[key].append(value)
            output_column_order.append(output_col)

        return collected_aggregations, output_column_order

    def _perform_aggregation(
        self,
        df: dd.DataFrame,
        filter_column: str,
        aggregations: List[Tuple[str, str, Any]],
        additional_column_name: str,
        group_columns: List[str],
    ):
        tmp_df = df

        if filter_column:
            filter_expression = tmp_df[filter_column]
            tmp_df = tmp_df[filter_expression]

            logger.debug(f"Filtered by {filter_column} before aggregation.")

        group_columns = [tmp_df[group_column] for group_column in group_columns]
        group_columns_and_nulls = get_groupby_with_nulls_cols(
            tmp_df, group_columns, additional_column_name
        )
        grouped_df = tmp_df.groupby(by=group_columns_and_nulls)

        # Convert into the correct format for dask
        aggregations_dict = defaultdict(dict)
        for aggregation in aggregations:
            input_col, output_col, aggregation_f = aggregation

            aggregations_dict[input_col][output_col] = aggregation_f

        # Now apply the aggregation
        logger.debug(f"Performing aggregation {dict(aggregations_dict)}")
        agg_result = grouped_df.agg(aggregations_dict)

        # ... fix the column names to a single level ...
        agg_result.columns = agg_result.columns.get_level_values(-1)

        return agg_result

df_schema = pd.read_json("/srv/retail_schema.json", lines=True)


def json_engine(*args, **kwargs):
    df = pd.read_json(*args, **kwargs)
    # add any schema columns missing from this chunk, drop any extras,
    # and return the columns in schema order
    for c in set(df_schema.columns) - set(df.columns):
        df[c] = pd.Series(dtype=df_schema[c].dtype)
    df = df.drop(columns=set(df.columns) - set(df_schema.columns))
    return df.loc[:, df_schema.columns]


nunique = dd.Aggregation(
    name="nunique",
    chunk=lambda s: s.apply(lambda x: list(set(x))),
    agg=lambda s0: s0._selected_obj.groupby(
        level=list(range(s0._selected_obj.index.nlevels))
    ).sum(),
    finalize=lambda s1: s1.apply(lambda final: len(set(final))),
)

df = dd.read_json(
    "s3://retail-bucket/topics/retail/**.json",
    lines=True,
    engine=json_engine,
    storage_options={
        "key": "access_me",
        "secret": "i_am_a_secret",
        "client_kwargs": {
            "endpoint_url": "http://minio:9000"
        },
    },
)