Esempio n. 1
0
    def visit_individual_node(
            self, node: IndividualNode[Entity]) -> EquivalenceGraph:
        """
        Build the equivalence graph contributed by a single node.

        The entity behind the node is looked up and each of its join
        relationships is inspected. A relationship is considered only
        when its right hand side entity is one of the entities held in
        ``self.__entities``; in that case every column equivalence it
        declares is recorded in the returned graph through
        ``__add_relationship``.
        """
        lhs_entity = entity_from_node(node)
        graph: EquivalenceGraph = {}

        relationships = get_entity(lhs_entity).get_all_join_relationships()
        for relationship in relationships.values():
            rhs_entity = relationship.rhs_entity
            if rhs_entity not in self.__entities:
                # The other side of this relationship is not tracked by
                # this visitor, so it contributes nothing to the graph.
                continue
            for equivalence in relationship.equivalences:
                self.__add_relationship(
                    graph,
                    lhs_entity,
                    equivalence.left_col,
                    rhs_entity,
                    equivalence.right_col,
                )
        return graph
Esempio n. 2
0
    def visit_join_clause(self, node: JoinClause[Entity]) -> EquivalenceGraph:
        """
        Build the equivalence graph for a join clause and its children.

        Each join key of the clause is recorded as a relationship between
        the column on its left side and the column on its right side,
        with table aliases resolved to entities through the alias/node
        map of the clause. The graphs produced by visiting the left and
        the right child are then merged in: for a column present on both
        sides the sets of equivalent columns are united.
        """
        graph: EquivalenceGraph = {}
        nodes_by_alias = node.get_alias_node_map()

        for key in node.keys:
            lhs, rhs = key.left, key.right
            self.__add_relationship(
                graph,
                entity_from_node(nodes_by_alias[lhs.table_alias]),
                lhs.column,
                entity_from_node(nodes_by_alias[rhs.table_alias]),
                rhs.column,
            )

        # The left subtree is visited and merged before the right one.
        for child in (node.left_node, node.right_node):
            for col, equivalences in child.accept(self).items():
                # Union into a new set so the child's sets are not shared.
                graph[col] = graph.get(col, set()) | equivalences
        return graph
Esempio n. 3
0
def get_equivalent_columns(
    join: JoinClause[Entity],
) -> Mapping[QualifiedCol, Set[QualifiedCol]]:
    """
    Return, for each column in the join's equivalence graph, the set of
    columns that are semantically equivalent to it.

    EquivalenceExtractor is run over the join to obtain the equivalence
    graph as adjacency sets. The connected components of that graph are
    then computed: two columns in the same component are equivalent,
    either directly (they share an edge) or transitively.

    The result maps every column of a component to all the *other*
    columns of that component, so a column never appears in its own
    set and a column with no equivalents maps to an empty set.
    """
    def collect_component(start: QualifiedCol) -> Set[QualifiedCol]:
        """
        Collect every column reachable from `start` with a depth first
        walk of the adjacency sets, `start` included.
        """
        reached: Set[QualifiedCol] = {start}
        # One neighbour iterator per column on the path being explored;
        # this replaces the call stack of a recursive traversal.
        pending = [iter(adjacency_sets.get(start, set()))]
        while pending:
            for neighbour in pending[-1]:
                if neighbour not in reached:
                    reached.add(neighbour)
                    pending.append(
                        iter(adjacency_sets.get(neighbour, set())))
                    # Descend into the newly found column first.
                    break
            else:
                # All neighbours of the column on top have been handled.
                pending.pop()
        return reached

    alias_nodes = join.get_alias_node_map()
    entities_in_join = {entity_from_node(n) for n in alias_nodes.values()}
    adjacency_sets = join.accept(EquivalenceExtractor(entities_in_join))

    connected_components: MutableMapping[QualifiedCol, Set[QualifiedCol]] = {}
    for start in adjacency_sets:
        if start in connected_components:
            # Already covered by a component found earlier.
            continue
        component = collect_component(start)
        for member in component:
            # Each member gets its own set of peers, itself excluded.
            peers = copy(component)
            peers.remove(member)
            connected_components[member] = peers

    return connected_components
Esempio n. 4
0
def add_equivalent_conditions(query: CompositeQuery[Entity]) -> None:
    """
    Replicate single-column conditions of a join query onto the
    semantically equivalent columns of the other tables in the join.

    Example: in a join between events and groupedmessage, a condition
    on events.project_id is also applied to groupedmessage.project_id
    since the two columns are semantically equivalent.

    The purpose is to give every subquery as many conditions as
    possible so that clickhouse loads less data for each of them.

    The query is modified in place: its condition is replaced with the
    AND of the original top level conditions and the generated ones.
    Nothing happens when the query has no condition or when its data
    source is a simple query. When the data source is itself a
    composite query, the function is applied to that nested query
    instead.

    Conditions that are skipped:
    - top level conditions referencing columns from multiple tables,
      as they cannot be pushed down to subqueries.
    - top level conditions referencing multiple columns, since some of
      them may lack an equivalent. TODO: This can be extended to
      conditions with multiple columns that all have an equivalent in
      the same entity.
    """
    from_clause = query.get_from_clause()
    if isinstance(from_clause, CompositeQuery):
        # The join, if there is one, is further down the nesting.
        add_equivalent_conditions(from_clause)
        return
    if isinstance(from_clause, ProcessableQuery):
        return

    # From here on the data source has to be a join.
    alias_to_entity: MutableMapping[str, EntityKey] = {}
    entity_to_alias: MutableMapping[EntityKey, Set[str]] = {}
    for alias, node in from_clause.get_alias_node_map().items():
        entity = entity_from_node(node)
        alias_to_entity[alias] = entity
        entity_to_alias.setdefault(entity, set()).add(alias)

    column_equivalence = get_equivalent_columns(from_clause)
    condition = query.get_condition()
    if condition is None:
        return

    and_components = get_first_level_and_conditions(condition)
    extra_conditions = []
    for sub_condition in and_components:
        # Only top level conditions built on exactly one column are
        # replicated; anything else yields None here and is skipped.
        #
        # TODO: Address top level conditions that contain multiple
        # columns each of which has an equivalent in the same entity.
        sole_column = _classify_single_column_condition(
            sub_condition, alias_to_entity)
        if sole_column is None:
            continue
        column, source_alias = sole_column

        # The column to be replaced is the same for every copy of this
        # condition; only the destination alias and column vary.
        replace_from = partial(_replace_col, source_alias, column.column)

        # Same entity joined under other aliases: same column name.
        extra_conditions.extend(
            sub_condition.transform(
                partial(replace_from, other_alias, column.column))
            for other_alias in entity_to_alias[column.entity]
            if other_alias != source_alias
        )

        # Equivalent columns on other entities: apply the condition to
        # every alias under which those entities appear in the join.
        for equivalent in column_equivalence.get(column, []):
            for other_alias in entity_to_alias.get(equivalent.entity, set()):
                extra_conditions.append(
                    sub_condition.transform(
                        partial(replace_from, other_alias,
                                equivalent.column)))

    query.set_ast_condition(
        combine_and_conditions(list(and_components) + extra_conditions))