Ejemplo n.º 1
0
    def apply_selection_from_single_df(
            self, ops: RelationalOp, df_name: DFName,
            selections: List[SelectionValue]) -> RelationalOp:
        """Rewrite ``ops`` so that ``selections`` (all on ``df_name``) apply.

        If one of the base ops found in ``ops`` is ``df_name`` itself, the
        selections are applied to that base directly. Otherwise a base that
        can be joined with ``df_name`` is looked up and the selections are
        applied through that join. If neither exists, ``ops`` is returned
        unchanged.

        Args:
            ops: operation tree to rewrite; the replacement is spliced into a
                deep copy of it.
            df_name: name of the dataframe every selection refers to.
            selections: selections to apply; the caller guarantees they all
                share ``df_name``.

        Returns:
            The rewritten operation tree, or ``ops`` itself when no usable
            base or join was found.

        Raises:
            InternalLogicalError: if the replacement op or the name of the
                base to replace ends up falsy.
        """
        all_bases = find_all_baseops(ops)
        # Bases whose own name is the selection's dataframe need no join.
        direct_matches = [base for base in all_bases
                          if base.df_name == df_name]

        if direct_matches:
            matched_base = direct_matches[0]
            base_name_to_replace = matched_base.df_name
            new_op = apply_non_join_selection(matched_base, selections)
        else:
            # Fall back to a base that can be joined with df_name.
            join_info = self.find_joinable_base(all_bases, df_name)
            if not join_info:
                # Nothing to join with: the selection is a no-op.
                if ISDEBUG:
                    debug_log(
                        f"No op for {df_name} selection because no join was found"
                    )
                return ops
            # NOTE(review): the original comment said "it's always the right
            # one (by construct)" while the code reads `left_df` -- confirm
            # which side is intended.
            base_name_to_replace = join_info.left_df.df_name
            new_op = self.apply_join_selection(join_info, selections)

        # Splice the replacement into a copy of the tree.
        if not (new_op and base_name_to_replace):
            raise InternalLogicalError(
                "Replacement Op is not set or the df_name is not set")
        return set_if_eq(deepcopy(ops), new_op, base_name_to_replace)
Ejemplo n.º 2
0
    def apply_selection(self,
                        target_df: MidasDataFrame,
                        selections: List[SelectionValue],
                        is_union=False) -> Optional[MidasDataFrame]:
        """Build a new dataframe from ``target_df`` with ``selections`` applied.

        Each selection is first resolved through ``get_base_df_selection``;
        selections that resolve to ``None`` are ignored. The remaining ones
        are grouped by the dataframe name of their column and applied one
        group at a time to ``target_df``'s ops.

        Args:
            target_df: dataframe whose ops are rewritten.
            selections: selections to apply.
            is_union: accepted for interface compatibility; not read here.

        Returns:
            ``None`` when ``selections`` is empty, otherwise the dataframe
            produced by ``target_df.new_df_from_ops``.

        Raises:
            InternalLogicalError: if any resolved selection is on
                ``target_df`` itself.
        """
        if not len(selections):
            return None

        # Group the resolved selections by the dataframe they are defined on.
        selections_by_df = defaultdict(list)
        for raw_selection in selections:
            resolved = self.get_base_df_selection(raw_selection)
            if resolved is None:
                # Unresolvable selections (e.g. miscategorized ones) are
                # skipped rather than treated as errors.
                continue
            selections_by_df[resolved.column.df_name].append(resolved)

        if target_df.df_name in selections_by_df:
            raise InternalLogicalError(
                f"Shouldn't be using context to do the filter if the two DFs are the same, we got {target_df.df_name} as target, which is in {selections_by_df.keys()}"
            )

        # The order in which the groups are applied does not matter.
        rewritten_ops = target_df._ops
        for source_name, source_selections in selections_by_df.items():
            rewritten_ops = self.apply_selection_from_single_df(
                rewritten_ops, source_name, source_selections)  # type: ignore
        return target_df.new_df_from_ops(rewritten_ops)  # type: ignore
Ejemplo n.º 3
0
 def __init__(self, df: MidasDataFrame):
     """Create the record for a dataframe that is being visualized.

     Args:
         df: the dataframe to track; it must carry a truthy ``df_name``.

     Raises:
         InternalLogicalError: if ``df.df_name`` is falsy.
     """
     name = df.df_name
     if not name:
         raise InternalLogicalError("Visualized dfs must have df_names")
     self.df = df
     self.created_on = datetime.now()
     self.df_name = name
     # Remember the dataframe as it was first defined, separately from
     # `self.df`.
     self.original_df = df
     self.df_type = "visualized"
Ejemplo n.º 4
0
 def get_df(self, df_name: DFName) -> MidasDataFrame:
     """Return the dataframe registered under ``df_name``.

     For a ``VisualizedDFInfo`` entry the ``original_df`` is returned; for
     any other entry its ``df`` is returned.

     Args:
         df_name: name to look up in ``self.df_info_store``.

     Returns:
         The dataframe held by the matching store entry.

     Raises:
         InternalLogicalError: if the store has no entry for ``df_name`` or
             the entry is falsy.
     """
     try:
         found = self.df_info_store[df_name]
     except KeyError:
         # Subscripting a mapping raises on a missing key, which previously
         # bypassed the "not found" error below.
         found = None
     if not found:
         raise InternalLogicalError(f"DF {df_name} not found")
     if isinstance(found, VisualizedDFInfo):
         return found.original_df
     return found.df
Ejemplo n.º 5
0
 def add_join_info(self, joins: JoinInfo):
     """Register ``joins`` in ``self.join_info`` under both name orderings.

     The ``(left, right)`` key maps to ``joins`` itself and the
     ``(right, left)`` key maps to ``joins.swap_left_right()``.

     Args:
         joins: join description exposing ``left_df``, ``right_df`` and
             ``swap_left_right()``.

     Raises:
         InternalLogicalError: if either dataframe's ``df_name`` is ``None``.
     """
     left, right = joins.left_df, joins.right_df
     if left.df_name is None or right.df_name is None:  # type: ignore
         raise InternalLogicalError(
             "The DFs with join info should have df_names")
     registry = self.join_info
     registry[(left.df_name, right.df_name)] = joins  # type: ignore
     registry[(right.df_name,
               left.df_name)] = joins.swap_left_right()  # type: ignore
Ejemplo n.º 6
0
def get_midas_code(op: RelationalOp, midas_reference_name: str) -> str:
    """Render a relational-op tree as chained table-method source code.

    The tree is walked from ``op`` down through ``op.child``; a base op
    renders as its ``df_name`` and each other op appends one method call
    (``.where``, ``.select``, ``.group`` or ``.join``) to the code of its
    child.

    Args:
        op: root of the operation tree to render.
        midas_reference_name: forwarded unchanged to
            ``convert_value_or_predicate`` and to recursive calls.

    Returns:
        The generated source text. For a join the text starts with a
        (possibly empty) preparation line followed by a newline.

    Raises:
        InternalLogicalError: if a join's other dataframe has no ``df_name``
            and no usable ``_suggested_df_name``.
        NotImplementedError: for an op type not handled here.
    """
    if op.op_type == RelationalOpType.base:
        b_op = cast(BaseOp, op)
        return b_op.df_name

    prev_table = get_midas_code(op.child, midas_reference_name)

    if op.op_type == RelationalOpType.where:
        s_op = cast(Where, op)
        col_or_label = convert_value_or_predicate(
            s_op.predicate.column_or_label, midas_reference_name)
        val_or_pred = convert_value_or_predicate(
            s_op.predicate.value_or_predicate, midas_reference_name)
        if s_op.predicate.other is None:
            return f"{prev_table}.where({col_or_label}, {val_or_pred})"
        other = convert_value_or_predicate(s_op.predicate.other,
                                           midas_reference_name)
        return f"{prev_table}.where({col_or_label}, {val_or_pred}, {other})"

    if op.op_type == RelationalOpType.project:
        p_op = cast(Select, op)
        return f"{prev_table}.select({p_op.columns!r})"

    if op.op_type == RelationalOpType.groupby:
        g_op = cast(GroupBy, op)
        if g_op.collect is None:
            return f"{prev_table}.group({g_op.columns!r})"
        group_fun = get_lambda_declaration_or_fn_name(g_op.collect)
        return f"{prev_table}.group({g_op.columns!r}, {group_fun})"

    if op.op_type == RelationalOpType.join:
        j_op = cast(Join, op)
        join_prep_code = ""
        # we assume that the other has data!
        if j_op.other.df_name is not None:
            other_df_name = j_op.other.df_name
        else:
            # The unnamed side must first be materialized under its suggested
            # name. The previous hasattr-based guard either raised
            # AttributeError (attribute missing) or let a None name through,
            # so the intended error was never raised.
            suggested_name = getattr(j_op.other, "_suggested_df_name", None)
            if not suggested_name:
                raise InternalLogicalError(
                    "the join df should have a suggested name")
            ops_code = get_midas_code(j_op.other._ops, midas_reference_name)
            join_prep_code = f"{suggested_name} = {ops_code}"
            other_df_name = suggested_name
        # The newline is emitted even when there is no preparation line;
        # kept as-is because callers may rely on the exact text.
        return f"{join_prep_code}\n{prev_table}.join({j_op.self_columns!r}, {other_df_name}, {j_op.other_columns!r})"

    raise NotImplementedError(op.op_type)
Ejemplo n.º 7
0
 def _helper(op: RelationalOp, new_op: RelationalOp,
             parent_op: Optional[RelationalOp]):
     """Walk down the child chain from ``op`` to its base op.

     If the base op's ``df_name`` equals the enclosing scope's ``df_name``
     and it has a parent, the parent's ``child`` is set to ``new_op``.
     A base op with a different name is left untouched.

     Raises:
         InternalLogicalError: if a non-base op without a child is reached.
     """
     node, parent = op, parent_op
     while True:
         if node.op_type == RelationalOpType.base:
             if cast(BaseOp, node).df_name == df_name:
                 if parent is None:
                     # The matching base is the root, so there is no parent
                     # to rewire.
                     # NOTE(review): with no `nonlocal` declaration in this
                     # function, this binds a local that is discarded on
                     # return -- confirm whether the enclosing scope is
                     # meant to observe this flag.
                     should_return_replacement = True
                 else:
                     parent.child = new_op
             return
         if not node.has_child():
             raise InternalLogicalError(
                 "Should either have child or be of base type")
         # Descend one level, remembering the node we came from.
         node, parent = node.child, node
Ejemplo n.º 8
0
 def to_str(self):
     """Always fail: an empty selection has no string form.

     Raises:
         InternalLogicalError: unconditionally.
     """
     message = "Should not try to make empty selections into strings"
     raise InternalLogicalError(message)
Ejemplo n.º 9
0
 def to_str(self):
     """Always fail: this implementation is not meant to be called.

     Raises:
         InternalLogicalError: unconditionally.
     """
     message = "SelectionValue is abstract and should not be instantiated"
     raise InternalLogicalError(message)