def sort_distributed_analysis(sort_node, array_dists):
    """Unify the distributions of a sort node's input and output arrays.

    All inputs (key arrays plus input dataframe columns) end up with one
    shared distribution, and all outputs (output key arrays plus output
    dataframe columns) end up with another. Distributions are combined by
    taking the minimum of their enum values.

    Args:
        sort_node: node exposing ``key_arrs``, ``out_key_arrs`` (lists of
            variables) and ``df_in_vars``, ``df_out_vars`` (mappings whose
            values are variables); every variable has a ``name``.
        array_dists: mapping from variable name to ``Distribution``;
            updated in place. Every input name must already be present.
    """
    inputs = sort_node.key_arrs + list(sort_node.df_in_vars.values())
    outputs = sort_node.out_key_arrs + list(sort_node.df_out_vars.values())

    # Inputs share a single distribution: the lowest among OneD and the
    # distributions currently recorded for each input.
    in_value = min(
        [Distribution.OneD.value]
        + [array_dists[var.name].value for var in inputs]
    )

    # The shuffle caps the output at OneD_Var; it also may not exceed the
    # input distribution or anything already recorded for an output
    # (outputs may not have an entry yet).
    # TODO: set to input dist in inplace case
    recorded_out_values = [
        array_dists[var.name].value
        for var in outputs
        if var.name in array_dists
    ]
    out_dist = Distribution(
        min([in_value, Distribution.OneD_Var.value] + recorded_out_values)
    )

    # Any output distribution other than OneD_Var is pushed back onto the
    # inputs (original note: "output can cause input REP").
    if out_dist == Distribution.OneD_Var:
        in_dist = Distribution(in_value)
    else:
        in_dist = out_dist

    for var in inputs:
        array_dists[var.name] = in_dist
    for var in outputs:
        array_dists[var.name] = out_dist

    # TODO: handle rebalance
    # assert not (in_dist == Distribution.OneD
    #             and out_dist == Distribution.OneD_Var)
def filter_distributed_analysis(filter_node, array_dists):
    """Unify the distributions of a filter node's inputs and outputs.

    The input columns and the boolean mask array receive one shared
    distribution; the output columns receive another that is at most
    OneD_Var and never above the input distribution. Distributions are
    combined by taking the minimum of their enum values.

    Args:
        filter_node: node exposing ``df_in_vars`` and ``df_out_vars``
            (mappings whose values are variables) and ``bool_arr`` (a
            variable); every variable has a ``name``.
        array_dists: mapping from variable name to ``Distribution``;
            updated in place. Input and mask names must already be present.
    """
    in_names = [var.name for var in filter_node.df_in_vars.values()]
    out_names = [var.name for var in filter_node.df_out_vars.values()]
    mask_name = filter_node.bool_arr.name

    # Lowest distribution among OneD, the input columns and the mask.
    in_dist = Distribution(min(
        [Distribution.OneD.value]
        + [array_dists[name].value for name in in_names]
        + [array_dists[mask_name].value]
    ))

    # Record the shared input distribution before looking at the outputs.
    for name in in_names:
        array_dists[name] = in_dist
    array_dists[mask_name] = in_dist

    # Outputs start at OneD_Var, are lowered by anything already recorded
    # for them (they may not have an entry yet), and must not exceed the
    # input distribution (original note: "REP in causes REP out").
    recorded_out_values = [
        array_dists[name].value for name in out_names if name in array_dists
    ]
    out_dist = Distribution(min(
        [Distribution.OneD_Var.value, in_dist.value] + recorded_out_values
    ))
    for name in out_names:
        array_dists[name] = out_dist

    # Nothing to push back when the outputs ended up at plain OneD_Var.
    if out_dist == Distribution.OneD_Var:
        return

    # Otherwise the output distribution overrides the mask and the inputs
    # (original note: "output can cause input REP").
    array_dists[mask_name] = out_dist
    for name in in_names:
        array_dists[name] = out_dist
def join_distributed_analysis(join_node, array_dists):
    """Unify the distributions of a join node's inputs and outputs.

    Every left and right input column receives one shared distribution and
    every output column receives another that is at most OneD_Var and never
    above the input distribution. Distributions are combined by taking the
    minimum of their enum values.

    Args:
        join_node: node exposing ``left_vars``, ``right_vars`` and
            ``df_out_vars`` (mappings whose values are variables with a
            ``name``).
        array_dists: mapping from variable name to ``Distribution``;
            updated in place. All input names must already be present.
    """

    def lower_of(first, second):
        # The distribution whose enum value is the smaller of the two.
        return Distribution(min(first.value, second.value))

    input_vars = (list(join_node.left_vars.values())
                  + list(join_node.right_vars.values()))
    output_vars = list(join_node.df_out_vars.values())

    # One distribution for all inputs, starting from OneD.
    in_dist = Distribution.OneD
    for var in input_vars:
        in_dist = lower_of(in_dist, array_dists[var.name])

    # One distribution for all outputs, starting from OneD_Var; outputs
    # may not have an entry in array_dists yet.
    out_dist = Distribution.OneD_Var
    for var in output_vars:
        if var.name in array_dists:
            out_dist = lower_of(out_dist, array_dists[var.name])

    # Output must not exceed the input distribution
    # (original note: "REP in causes REP out").
    out_dist = lower_of(out_dist, in_dist)
    for var in output_vars:
        array_dists[var.name] = out_dist

    # Any output distribution other than OneD_Var is pushed back onto the
    # inputs (original note: "output can cause input REP").
    if out_dist != Distribution.OneD_Var:
        in_dist = out_dist

    for var in input_vars:
        array_dists[var.name] = in_dist
def join_distributed_analysis(join_node, array_dists):
    """Unify the distributions of a join node, keeping each side separate.

    The left input columns share one distribution and the right input
    columns share another; the two sides may differ (the original comment
    cites a "q26 case" where they are 1D and 1D_Var separately). The output
    columns share a distribution that is at most OneD_Var and never above
    either input side. Distributions are combined by taking the minimum of
    their enum values.

    Args:
        join_node: node exposing ``left_vars``, ``right_vars`` and
            ``df_out_vars`` (mappings whose values are variables with a
            ``name``).
        array_dists: mapping from variable name to ``Distribution``;
            updated in place. All input names must already be present.
    """
    # TODO: can columns of the same input table have different dists?
    left_names = [var.name for var in join_node.left_vars.values()]
    right_names = [var.name for var in join_node.right_vars.values()]
    out_names = [var.name for var in join_node.df_out_vars.values()]

    # Per-side lowest value, each starting from OneD.
    left_value = min(
        [Distribution.OneD.value]
        + [array_dists[name].value for name in left_names]
    )
    right_value = min(
        [Distribution.OneD.value]
        + [array_dists[name].value for name in right_names]
    )

    # Outputs start from OneD_Var, are lowered by anything already recorded
    # for them (they may not have an entry yet), and must not exceed either
    # input side (original note: "REP in causes REP out").
    recorded_out_values = [
        array_dists[name].value for name in out_names if name in array_dists
    ]
    out_dist = Distribution(min(
        [Distribution.OneD_Var.value, left_value, right_value]
        + recorded_out_values
    ))
    for name in out_names:
        array_dists[name] = out_dist

    # Any output distribution other than OneD_Var overrides both sides
    # (original note: "output can cause input REP").
    if out_dist == Distribution.OneD_Var:
        left_dist = Distribution(left_value)
        right_dist = Distribution(right_value)
    else:
        left_dist = right_dist = out_dist

    for name in left_names:
        array_dists[name] = left_dist
    for name in right_names:
        array_dists[name] = right_dist
def filter_distributed_analysis(filter_node, array_dists):
    """Unify the distributions of a filter node's input and output columns.

    The input columns receive one shared distribution and the output
    columns another. Distributions are combined by taking the minimum of
    their enum values. The output is at most OneD_Var and, like the other
    node analyses in this module, is additionally limited by the input
    distribution; an output distribution other than OneD_Var is propagated
    back to the inputs so both sides stay consistent.

    Args:
        filter_node: node exposing ``df_vars`` (mapping from dataframe
            identifier to a mapping whose values are column variables with
            a ``name``) and the identifiers ``df_in`` / ``df_out``.
        array_dists: mapping from variable name to ``Distribution``;
            updated in place. All input column names must already be
            present.
    """
    df_vars = filter_node.df_vars
    in_vars = list(df_vars[filter_node.df_in].values())
    out_vars = list(df_vars[filter_node.df_out].values())

    # Input columns share the lowest distribution among OneD and their
    # currently recorded distributions.
    in_dist = Distribution.OneD
    for col_var in in_vars:
        in_dist = Distribution(
            min(in_dist.value, array_dists[col_var.name].value))
    for col_var in in_vars:
        array_dists[col_var.name] = in_dist

    # Output columns start at OneD_Var and are lowered by whatever is
    # already recorded for them (they may not have an entry yet).
    out_dist = Distribution.OneD_Var
    for col_var in out_vars:
        if col_var.name in array_dists:
            out_dist = Distribution(
                min(out_dist.value, array_dists[col_var.name].value))

    # Fix: the output must also meet the input distribution. Previously an
    # input with a lower distribution left the output at OneD_Var.
    out_dist = Distribution(min(out_dist.value, in_dist.value))
    for col_var in out_vars:
        array_dists[col_var.name] = out_dist

    # Fix: an output distribution other than OneD_Var is pushed back onto
    # the inputs, so a restricted output also restricts the inputs.
    if out_dist != Distribution.OneD_Var:
        for col_var in in_vars:
            array_dists[col_var.name] = out_dist
def sort_distributed_analysis(sort_node, array_dists):
    """Give a sort node's key array and data columns one shared distribution.

    The shared distribution is the one with the smallest enum value among
    the key array and every column in ``sort_node.df_vars``.

    Args:
        sort_node: node exposing ``key_arr`` (a variable) and ``df_vars``
            (mapping whose values are variables); every variable has a
            ``name``.
        array_dists: mapping from variable name to ``Distribution``;
            updated in place. The key and all column names must already be
            present.
    """
    key_name = sort_node.key_arr.name
    col_names = [var.name for var in sort_node.df_vars.values()]

    # Start from the key's distribution and lower it by each column's.
    shared_dist = array_dists[key_name]
    if col_names:
        lowest = min(
            [shared_dist.value]
            + [array_dists[name].value for name in col_names]
        )
        shared_dist = Distribution(lowest)

    # Write the shared distribution back to every column and the key.
    for name in col_names:
        array_dists[name] = shared_dist
    array_dists[key_name] = shared_dist