Example #1
    def get_node_predecessors(
        self, u: Hashable, include_metadata: bool = False
    ) -> Collection:
        """
        Get a generator of all downstream nodes from this node.

        Arguments:
            u (Hashable): The source node ID

        Returns:
            Generator

        """
        if include_metadata:
            return {
                e["source"]: e
                for e in (
                    self._g.V()
                    .has(ID, u)
                    .inE()
                    .project("target", "source", "properties")
                    .by(__.inV().values(ID))
                    .by(__.outV().values(ID))
                    .by(__.valueMap(True))
                    .toList()
                )
            }
        return self._g.V().has(ID, u).in_().values(ID).toList()
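For context, a hedged usage sketch (the `backend` instance and the node IDs are invented, not part of the example above):

    # Hypothetical usage; `backend` and the node IDs are assumptions.
    preds = backend.get_node_predecessors("b")  # e.g. ["a"]
    with_meta = backend.get_node_predecessors("b", include_metadata=True)
    # -> {"a": {"target": "b", "source": "a", "properties": {...}}}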
Example #2
    def all_edges_as_iterable(self, include_metadata: bool = False) -> Collection:
        """
        Get a list of all edges in this graph, arbitrary sort.

        Arguments:
            include_metadata (bool: False): Whether to include edge metadata

        Returns:
            Generator: A generator of all edges (arbitrary sort)

        """
        if include_metadata:
            return iter(
                [
                    (e["source"], e["target"], _node_to_metadata(e["properties"]))
                    for e in (
                        self._g.V()
                        .outE()
                        .project("target", "source", "properties")
                        .by(__.inV().values(ID))
                        .by(__.outV().values(ID))
                        .by(__.valueMap(True))
                        .toList()
                    )
                ]
            )
        return iter(
            [
                (e["source"], e["target"])
                for e in self._g.V()
                .outE()
                .project("target", "source")
                .by(__.inV().values(ID))
                .by(__.outV().values(ID))
                .toList()
            ]
        )
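A hedged usage sketch (`backend` is an assumed instance of the same class):

    # Hypothetical usage; `backend` is an assumption.
    for source, target in backend.all_edges_as_iterable():
        print(source, "->", target)
    for source, target, meta in backend.all_edges_as_iterable(include_metadata=True):
        print(source, "->", target, meta)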
Example #3
    def _column_entities(cls, *, _g: GraphTraversalSource,
                         tables_ids: Iterable[str],
                         existing: EXISTING) -> None:
        # fetch table -> column links
        g = _g.V(tuple(tables_ids))
        g = g.outE(EdgeTypes.Column.value.label)
        g = g.inV().hasLabel(VertexTypes.Column.value.label).as_('columns')

        # fetch column -> links (no Stat)
        for t in [EdgeTypes.Description]:
            g = g.coalesce(__.select('columns').outE(
                t.value.label).fold()).as_(t.name)

        g = g.select(EdgeTypes.Description.name).unfold()
        g = g.local(
            __.union(__.outV().id(), __.valueMap(True),
                     __.inV().id()).fold())
        cls._into_existing(g.toList(), existing)
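The local(__.union(__.outV().id(), __.valueMap(True), __.inV().id()).fold()) idiom above, reused in the examples below, flattens each edge into a three-element list. A hedged illustration of one result row (every value is invented):

    # Shape of one element of g.toList() under the union/fold pattern;
    # all values here are invented for illustration.
    row = ["table-vertex-id", {"id": "edge-id", "label": "DESCRIPTION"}, "description-vertex-id"]
    out_id, edge_value_map, in_id = row  # order matches the union() branches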
Example #4
    def expire_connections_for_other(cls, *, _g: GraphTraversalSource,
                                     vertex_type: VertexType,
                                     keys: FrozenSet[str],
                                     existing: EXISTING) -> None:
        # V().has(label, 'key', P.without(keys)) is more intuitive, but it
        # doesn't scale, so instead fetch all keys and diff them in memory
        g = _g.V().hasLabel(vertex_type.label).where(__.bothE())
        g = g.values(WellKnownProperties.Key.value.name)
        all_to_expire_keys = set(g.toList()).difference(keys)

        # TODO: handle any vertex ids that need something besides the key
        all_to_expire = set(
            vertex_type.id(key=key) for key in all_to_expire_keys)

        for to_expire in chunk(all_to_expire, 1000):
            g = _g.V(tuple(to_expire)).bothE()
            g = g.local(
                __.union(__.outV().id(), __.valueMap(True),
                         __.inV().id()).fold())
            cls._into_existing(g.toList(), existing)
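The chunk helper used here (and in table_entities below) is not shown in these examples; a minimal sketch consistent with how it is called, batching an iterable into fixed-size tuples, might look like:

    # Hedged sketch of the `chunk` helper; the real one in the source
    # project may differ.
    from itertools import islice
    from typing import Any, Iterable, Iterator, Tuple

    def chunk(iterable: Iterable[Any], n: int) -> Iterator[Tuple[Any, ...]]:
        it = iter(iterable)
        while True:
            batch = tuple(islice(it, n))
            if not batch:
                return
            yield batch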
Example #5
    def get_links(self, entity):
        """
        Returns the list of edges of the vertex, both
        inbound and outbound
            :param entity: the entity id of a vertex
        """

        # find the node identified by `entity`
        traversal = self.g.V().has('entity', entity)

        # traverse both inbound and outbound edges
        traversal = traversal.bothE()

        # select attributes on edges
        traversal = traversal \
            .as_('source', 'target', 'date_operation', 'valeur_euro') \
            .select('source', 'target', 'date_operation', 'valeur_euro')\
            .by(__.outV().values('entity'))\
            .by(__.inV().values('entity'))\
            .by('date_operation')\
            .by('valeur_euro')

        links = traversal.toList()
        return links
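A hedged usage sketch (`store` and the entity id are invented names):

    # Hypothetical usage; `store` is an assumed instance of the class.
    for link in store.get_links("entity-123"):
        print(link["source"], "->", link["target"],
              link["date_operation"], link["valeur_euro"])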
Example #6
    def get_or_create_edge(self, v1, v2, label: str):
        # Standard TinkerPop upsert pattern: return the existing edge from
        # v1 to v2 with this label, or create it if absent.
        return self.g.V(v1).as_('v1').V(v2).coalesce(
            __.inE(label).where(__.outV().as_('v1')),
            __.addE(label).from_('v1'))

    def _get_or_created_edge_from(self, node: GraphTraversal, other: int,
                                  label: str):
        # Same upsert pattern, anchored on an in-flight traversal rather
        # than a vertex id.
        return node.coalesce(
            __.inE(label).filter(__.outV().hasId(other)),
            __.addE(label).from_(self.g.V(other)))
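Both helpers build on Gremlin's coalesce() step: the first child traversal that yields a result wins, so the addE() branch only runs when no matching edge exists. A hedged standalone sketch of the same upsert recipe (the traversal source g, the vertex ids, and the 'knows' label are all assumptions):

    # Hedged sketch; assumes a connected traversal source `g` and
    # existing vertices with ids 1 and 2.
    from gremlin_python.process.graph_traversal import __

    edge = (g.V(1).as_('v1').V(2)
             .coalesce(__.inE('knows').where(__.outV().as_('v1')),  # reuse
                       __.addE('knows').from_('v1'))                # or create
             .next())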
Example #8
def set_fields_routing_probs(graph_client: GremlinClient,
                             metrics_client: HeronMetricsClient,
                             topology_id: str, topology_ref: str,
                             start: dt.datetime, end: dt.datetime) -> None:
    """ Sets the routing probabilities for fields grouped logical connections
    in physical graph with the supplied topology ID and reference. Routing
    probabilities are calculated using metrics from the defined time window.

    Arguments:
        graph_client (GremlinClient):   The client instance for the graph
                                        database.
        metrics_client (HeronMetricsClient): The client instance for metrics
                                             database.
        topology_id (str):  The topology identification string.
        topology_ref (str): The topology reference string.
        start (dt.datetime):    The UTC datetime object for the start of the
                                metrics gathering window.
        end (dt.datetime):  The UTC datetime object for the end of the metrics
                            gathering window.
    """

    LOG.info(
        "Setting fields grouping routing probabilities for topology %s "
        "reference %s using metrics data from %s to %s", topology_id,
        topology_ref, start.isoformat(), end.isoformat())

    topology_traversal: GraphTraversalSource = \
        graph_client.topology_subgraph(topology_id, topology_ref)

    i_to_i_rps: pd.DataFrame = calculate_inter_instance_rps(
        metrics_client, topology_id, start, end)

    # Re-index the DataFrame to make selecting RPs faster
    i_to_i_rps.set_index(["source_task", "stream", "destination_task"],
                         inplace=True)

    # Get a list of all fields grouped connections in the physical graph
    fields_connections: List[Dict[str, Union[int, str, Edge]]] = \
        (topology_traversal.V()
         .outE("logically_connected")
         .has("grouping", "FIELDS")
         .project("source_task", "stream", "edge", "destination_task")
         .by(__.outV().properties("task_id").value())
         .by(__.properties("stream").value())
         .by()
         .by(__.inV().properties("task_id").value())
         .toList())

    LOG.debug(
        "Processing %d fields grouped connections for topology %s "
        "reference %s", len(fields_connections), topology_id, topology_ref)

    connection: Dict[str, Union[int, str, Edge]]
    for connection in fields_connections:

        LOG.debug("Processing connection from instance %d to %d on stream %s",
                  connection["source_task"], connection["destination_task"],
                  connection["stream"])

        routing_prob: float = (i_to_i_rps.loc[
            connection["source_task"], connection["stream"],
            connection["destination_task"]]["routing_probability"])

        (topology_traversal.E(connection["edge"]).property(
            "routing_probability", routing_prob).next())
Example #9
    def table_entities(cls, *, _g: GraphTraversalSource,
                       table_data: List[Table], existing: EXISTING) -> None:

        all_tables_ids = list(
            set([
                VertexTypes.Table.value.id(
                    key=TableUris.get(database=t.database,
                                      cluster=t.cluster,
                                      schema=t.schema,
                                      table=t.name).table) for t in table_data
            ]))

        all_owner_ids = list(
            set([
                VertexTypes.User.value.id(key=key) for key in [
                    t.table_writer.id for t in table_data
                    if t.table_writer is not None
                ]
            ]))
        all_application_ids = list(
            set(
                list(
                    possible_vertex_ids_for_application_key(*[
                        t.table_writer.id for t in table_data
                        if t.table_writer is not None
                    ]))))

        # chunk these since fetching 100,000s at once seems to choke the server
        for tables_ids in chunk(all_tables_ids, 1000):
            LOGGER.info(f'fetching for tables: {tables_ids}')
            # fetch database -> cluster -> schema -> table links
            g = _g.V(tuple(tables_ids)).as_('tables')
            g = g.coalesce(__.inE(
                EdgeTypes.Table.value.label).dedup().fold()).as_(
                    EdgeTypes.Table.name)
            g = g.coalesce(__.unfold().outV().hasLabel(
                VertexTypes.Schema.value.label).inE(
                    EdgeTypes.Schema.value.label).dedup().fold()).as_(
                        EdgeTypes.Schema.name)
            g = g.coalesce(__.unfold().outV().hasLabel(
                VertexTypes.Cluster.value.label).inE(
                    EdgeTypes.Cluster.value.label).dedup().fold()).as_(
                        EdgeTypes.Cluster.name)

            # fetch table <- links
            for t in (EdgeTypes.BelongToTable, EdgeTypes.Generates,
                      EdgeTypes.Tag):
                g = g.coalesce(__.select('tables').inE(
                    t.value.label).fold()).as_(t.name)

            # fetch table -> column et al links
            for t in (EdgeTypes.Column, EdgeTypes.Description,
                      EdgeTypes.LastUpdatedAt, EdgeTypes.Source,
                      EdgeTypes.Stat):
                g = g.coalesce(__.select('tables').outE(
                    t.value.label).fold()).as_(t.name)

            # TODO: add owners, watermarks, last timestamp existing, source
            aliases = set([
                t.name
                for t in (EdgeTypes.Table, EdgeTypes.Schema, EdgeTypes.Cluster,
                          EdgeTypes.BelongToTable, EdgeTypes.Generates,
                          EdgeTypes.Tag, EdgeTypes.Column,
                          EdgeTypes.Description, EdgeTypes.LastUpdatedAt,
                          EdgeTypes.Source, EdgeTypes.Stat)
            ])
            g = g.select(*aliases).unfold().select(MapColumn.values).unfold()
            g = g.local(
                __.union(__.outV().id(), __.valueMap(True),
                         __.inV().id()).fold())
            cls._into_existing(g.toList(), existing)

            cls._column_entities(_g=_g,
                                 tables_ids=tables_ids,
                                 existing=existing)

        # fetch Application, User
        for ids in chunk(list(set(all_application_ids + all_owner_ids)), 5000):
            LOGGER.info(f'fetching for application/owners: {ids}')
            g = _g.V(ids).valueMap(True)
            cls._into_existing(g.toList(), existing)