def apply_selection_from_single_df(
        self, ops: RelationalOp, df_name: DFName,
        selections: List[SelectionValue]) -> RelationalOp:
    """Rewrite ``ops`` so that ``selections`` — all of which refer to
    ``df_name`` — are applied at the matching base op.

    If no base op of the tree is ``df_name`` itself, a joinable base is
    searched for instead. When neither exists the tree is returned
    unchanged (no-op).

    Raises:
        InternalLogicalError: a replacement was computed but either the
            replacement op or the anchor df name ended up unset.
    """
    base_ops = find_all_baseops(ops)
    # direct case: one of the base tables is df_name itself
    direct_hits = [b for b in base_ops if b.df_name == df_name]
    if direct_hits:
        hit = direct_hits[0]
        anchor_name = hit.df_name
        replacement = apply_non_join_selection(hit, selections)
    else:
        # indirect case: find a base we can join df_name against
        joinable = self.find_joinable_base(base_ops, df_name)
        if not joinable:
            # nothing to anchor the selection to -> leave tree untouched
            if ISDEBUG:
                debug_log(
                    f"No op for {df_name} selection because no join was found"
                )
            return ops
        # by construction the joinable df sits on the left side
        anchor_name = joinable.left_df.df_name
        replacement = self.apply_join_selection(joinable, selections)
    # splice the replacement subtree into a copy of the original tree
    if replacement and anchor_name:
        return set_if_eq(deepcopy(ops), replacement, anchor_name)
    raise InternalLogicalError(
        "Replacement Op is not set or the df_name is not set")
def apply_selection(self,
                    target_df: MidasDataFrame,
                    selections: List[SelectionValue],
                    is_union=False) -> Optional[MidasDataFrame]:
    """Filter ``target_df`` using selections made on *other* dataframes.

    Returns None when ``selections`` is empty; otherwise a new df whose
    op tree has each source df's selections applied.

    Raises:
        InternalLogicalError: a selection resolves to ``target_df``
            itself — that case must not go through this context path.
    """
    if not selections:
        return None
    # Group the resolved base selections by the df they originate from.
    # Selections that fail to resolve (None) are silently dropped — this
    # happens when a selection was miscategorized upstream.
    grouped = defaultdict(list)
    for resolved in map(self.get_base_df_selection, selections):
        if resolved is not None:
            grouped[resolved.column.df_name].append(resolved)
    if target_df.df_name in grouped:
        raise InternalLogicalError(
            f"Shouldn't be using context to do the filter if the two DFs are the same, we got {target_df.df_name} as target, which is in {grouped.keys()}"
        )
    # the order in which the per-df selections are applied is irrelevant
    ops = target_df._ops
    for source_name, source_selections in grouped.items():
        ops = self.apply_selection_from_single_df(
            ops, source_name, source_selections)  # type: ignore
    return target_df.new_df_from_ops(ops)  # type: ignore
def __init__(self, df: MidasDataFrame):
    """Wrap a named dataframe as a visualized-df record.

    Raises:
        InternalLogicalError: the df carries no df_name — visualized
            dfs must be named.
    """
    if not df.df_name:
        raise InternalLogicalError("Visualized dfs must have df_names")
    self.df = df
    # keep a handle on the df exactly as it was first defined
    self.original_df = df
    self.df_name = df.df_name
    self.created_on = datetime.now()
    self.df_type = "visualized"
def get_df(self, df_name: DFName) -> MidasDataFrame:
    """Return the dataframe registered under ``df_name``.

    For visualized dfs this returns the *original* (as-first-defined)
    df rather than the current one.

    Raises:
        InternalLogicalError: ``df_name`` is not registered.
    """
    # FIX: use .get() — plain indexing raised KeyError for unknown names,
    # which made the InternalLogicalError branch below unreachable for the
    # missing-key case it was clearly written for.
    found = self.df_info_store.get(df_name)
    if found:
        if isinstance(found, VisualizedDFInfo):
            return found.original_df
        return found.df
    raise InternalLogicalError(f"DF {df_name} not found")
def add_join_info(self, joins: JoinInfo):
    """Register a join relation under both key orders.

    The forward direction maps (left, right) to ``joins`` as given; the
    reverse direction stores the swapped variant so lookups work from
    either side.

    Raises:
        InternalLogicalError: either df of the join lacks a df_name.
    """
    left_name = joins.left_df.df_name    # type: ignore
    right_name = joins.right_df.df_name  # type: ignore
    if left_name is None or right_name is None:
        raise InternalLogicalError(
            "The DFs with join info should have df_names")
    self.join_info[(left_name, right_name)] = joins  # type: ignore
    self.join_info[(right_name, left_name)] = joins.swap_left_right()  # type: ignore
def get_midas_code(op: RelationalOp, midas_reference_name: str) -> str:
    """Recursively render the code string that reproduces ``op``'s tree.

    Walks from ``op`` down to its base table and emits chained
    where/select/group/join calls in datascience-table syntax.

    Raises:
        InternalLogicalError: a join's other df has neither a df_name
            nor a usable _suggested_df_name.
        NotImplementedError: the op type is not handled.
    """
    if op.op_type == RelationalOpType.base:
        b_op = cast(BaseOp, op)
        return b_op.df_name
    # every non-base op has a child; render it first
    prev_table = get_midas_code(op.child, midas_reference_name)
    if op.op_type == RelationalOpType.where:
        s_op = cast(Where, op)
        col_or_label = convert_value_or_predicate(
            s_op.predicate.column_or_label, midas_reference_name)
        val_or_pred = convert_value_or_predicate(
            s_op.predicate.value_or_predicate, midas_reference_name)
        if s_op.predicate.other is None:
            return f"{prev_table}.where({col_or_label}, {val_or_pred})"
        other = convert_value_or_predicate(s_op.predicate.other,
                                           midas_reference_name)
        return f"{prev_table}.where({col_or_label}, {val_or_pred}, {other})"
    if op.op_type == RelationalOpType.project:
        p_op = cast(Select, op)
        return f"{prev_table}.select({p_op.columns!r})"
    if op.op_type == RelationalOpType.groupby:
        g_op = cast(GroupBy, op)
        if g_op.collect is None:
            return f"{prev_table}.group({g_op.columns!r})"
        group_fun = get_lambda_declaration_or_fn_name(g_op.collect)
        return f"{prev_table}.group({g_op.columns!r}, {group_fun})"
    if op.op_type == RelationalOpType.join:
        j_op = cast(Join, op)
        join_prep_code = ""
        # we assume that the other side has data
        if j_op.other.df_name is not None:
            other_df_name = j_op.other.df_name
        else:
            # FIX: the original condition also did
            # hasattr(j_op.other._suggested_df_name, "_suggested_df_name"),
            # which is evaluated exactly when the first hasattr is False and
            # so raised AttributeError before the intended error could fire.
            if getattr(j_op.other, "_suggested_df_name", None) is None:
                raise InternalLogicalError(
                    "the join df should have a suggested name")
            # materialize the unnamed df first under its suggested name
            ops_code = get_midas_code(j_op.other._ops, midas_reference_name)
            join_prep_code = f"{j_op.other._suggested_df_name} = {ops_code}"
            other_df_name = j_op.other._suggested_df_name
        return f"{join_prep_code}\n{prev_table}.join({j_op.self_columns!r}, {other_df_name}, {j_op.other_columns!r})"
    raise NotImplementedError(op.op_type)
def _helper(op: RelationalOp, new_op: RelationalOp,
            parent_op: Optional[RelationalOp]):
    """Walk down the op chain to the BaseOp whose df_name equals the
    enclosing scope's `df_name`, and splice `new_op` in its place via
    the parent's `child` pointer.

    `df_name` (and `should_return_replacement`) are free variables from
    the enclosing function's scope, not parameters of this helper.
    """
    if (op.op_type == RelationalOpType.base):
        base_op = cast(BaseOp, op)
        if (base_op.df_name == df_name):
            # if parent_op is not defined, then we are literally replacing
            if parent_op is None:
                # NOTE(review): this binds a *local* should_return_replacement;
                # without a `nonlocal` declaration the enclosing scope cannot
                # observe it — confirm whether that flag is ever read.
                should_return_replacement = True
                return
            else:
                # re-point the parent at the replacement subtree
                parent_op.child = new_op
                return
    elif (op.has_child()):
        # keep descending; the current op becomes the parent
        return _helper(op.child, new_op, op)
    else:
        raise InternalLogicalError(
            "Should either have child or be of base type")
def to_str(self):
    """An empty selection has no string form; reaching here is a logic error."""
    message = "Should not try to make empty selections into strings"
    raise InternalLogicalError(message)
def to_str(self):
    """Abstract: concrete SelectionValue subclasses must override this."""
    message = "SelectionValue is abstract and should not be instantiated"
    raise InternalLogicalError(message)