def _get_or_created_edge_to(self, node: GraphTraversal, other: int,
                            label: str):
    # Reuse an existing edge of `label` to vertex `other` if present,
    # otherwise create one (coalesce-based upsert).
    return node.coalesce(
        __.outE(label).filter(__.inV().hasId(other)),
        __.addE(label).to(self.g.V(other)))
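
# A minimal usage sketch of the coalesce-based "get or create" upsert above,
# assuming `writer` is an instance of the enclosing class and 2 is the id of
# an existing vertex; both names are illustrative, not part of the source.
marko = writer.g.V().has("name", "marko")
edge = writer._get_or_created_edge_to(marko, 2, "knows").next()
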
    def test_side_effects(self, remote_connection):
        statics.load_statics(globals())
        #
        g = traversal().withRemote(remote_connection)
        ###
        t = g.V().hasLabel("project").name.iterate()
        assert 0 == len(t.side_effects.keys())
        with pytest.raises(Exception):
            m = t.side_effects["m"]
        ###
        t = g.V().out("created").groupCount("m").by("name")
        results = t.toSet()
        assert 2 == len(results)
        assert Vertex(3) in results
        assert Vertex(5) in results
        assert 1 == len(t.side_effects.keys())
        assert "m" in t.side_effects.keys()
        m = t.side_effects["m"]
        assert isinstance(m, dict)
        assert 2 == len(m)
        assert 3 == m["lop"]
        assert 1 == m["ripple"]
        assert isinstance(m["lop"], long)
        assert isinstance(m["ripple"], long)

        # check status attributes
        assert "host" in t.side_effects.status_attributes

        ##
        t = g.V().out("created").groupCount("m").by("name").name.aggregate("n")
        results = t.toSet()
        assert 2 == len(results)
        assert "lop" in results
        assert "ripple" in results
        assert 2 == len(t.side_effects.keys())
        assert "m" in t.side_effects.keys()
        assert "n" in t.side_effects.keys()
        n = t.side_effects.get("n")
        assert isinstance(n, dict)
        assert 2 == len(n)
        assert "lop" in n.keys()
        assert "ripple" in n.keys()
        assert 3 == n["lop"]
        assert 1 == n["ripple"]

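        # gremlin-python submits Python lambdas as strings to be evaluated on
        # the server; the closure below reads the 'm' side effect per traverser.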
        t = g.withSideEffect('m', 32).V().map(lambda: "x: x.sideEffects('m')")
        results = t.toSet()
        assert 1 == len(results)
        assert 32 == list(results)[0]
        assert 32 == t.side_effects['m']
        assert 1 == len(t.side_effects.keys())
        with pytest.raises(Exception):
            x = t.side_effects["x"]

        a = g.V().has("name", "marko").next()
        b = g.V().has("name", "peter").next()
        edge = g.withSideEffect("b", b).V(a).addE("knows").to("b").next()
        assert "knows" == edge.label
        assert a == edge.outV
        assert b == edge.inV
        g.V().has("name", "marko").outE("knows").where(__.inV().has("name", "peter")).drop().iterate()
        ##
        edge = g.withSideEffect("a", a).withSideEffect("b", b).V().limit(1).addE("knows").from_("a").to("b").next()
        assert "knows" == edge.label
        assert a == edge.outV
        assert b == edge.inV
        g.V().has("name", "marko").outE("knows").where(__.inV().has("name", "peter")).drop().iterate()
def set_fields_routing_probs(graph_client: GremlinClient,
                             metrics_client: HeronMetricsClient,
                             topology_id: str, topology_ref: str,
                             start: dt.datetime, end: dt.datetime) -> None:
    """ Sets the routing probabilities for fields grouped logical connections
    in the physical graph with the supplied topology ID and reference. Routing
    probabilities are calculated using metrics from the defined time window.

    Arguments:
        graph_client (GremlinClient):   The client instance for the graph
                                        database.
        metrics_client (HeronMetricsClient): The client instance for the
                                             metrics database.
        topology_id (str):  The topology identification string.
        topology_ref (str): The topology reference string.
        start (dt.datetime):    The UTC datetime object for the start of the
                                metrics gathering window.
        end (dt.datetime):  The UTC datetime object for the end of the metrics
                            gathering window.
    """

    LOG.info(
        "Setting fields grouping routing probabilities for topology %s "
        "reference %s using metrics data from %s to %s", topology_id,
        topology_ref, start.isoformat(), end.isoformat())

    topology_traversal: GraphTraversalSource = \
        graph_client.topology_subgraph(topology_id, topology_ref)

    i_to_i_rps: pd.DataFrame = calculate_inter_instance_rps(
        metrics_client, topology_id, start, end)

    # Re-index the DataFrame to make selecting RPs faster
    i_to_i_rps.set_index(["source_task", "stream", "destination_task"],
                         inplace=True)

    # Get a list of all fields grouped connections in the physical graph
    fields_connections: List[Dict[str, Union[int, str, Edge]]] = \
        (topology_traversal.V()
         .outE("logically_connected")
         .has("grouping", "FIELDS")
         .project("source_task", "stream", "edge", "destination_task")
         .by(__.outV().properties("task_id").value())
         .by(__.properties("stream").value())
         .by()
         .by(__.inV().properties("task_id").value())
         .toList())

    LOG.debug(
        "Processing %d fields grouped connections for topology %s "
        "reference %s", len(fields_connections), topology_id, topology_ref)

    connection: Dict[str, Union[int, str, Edge]]
    for connection in fields_connections:

        LOG.debug("Processing connection from instance %d to %d on stream %s",
                  connection["source_task"], connection["destination_task"],
                  connection["stream"])

        routing_prob: float = (i_to_i_rps.loc[
            connection["source_task"], connection["stream"],
            connection["destination_task"]]["routing_probability"])

        (topology_traversal.E(connection["edge"]).property(
            "routing_probability", routing_prob).next())
    def table_entities(cls, *, _g: GraphTraversalSource,
                       table_data: List[Table], existing: EXISTING) -> None:

        all_tables_ids = list(
            set([
                VertexTypes.Table.value.id(
                    key=TableUris.get(database=t.database,
                                      cluster=t.cluster,
                                      schema=t.schema,
                                      table=t.name).table) for t in table_data
            ]))

        all_owner_ids = list(
            set([
                VertexTypes.User.value.id(key=key) for key in [
                    t.table_writer.id for t in table_data
                    if t.table_writer is not None
                ]
            ]))
        all_application_ids = list(
            set(
                list(
                    possible_vertex_ids_for_application_key(*[
                        t.table_writer.id for t in table_data
                        if t.table_writer is not None
                    ]))))

        # chunk these ids since sending 100,000s at once seems to choke the server
        for tables_ids in chunk(all_tables_ids, 1000):
            LOGGER.info(f'fetching for tables: {tables_ids}')
            # fetch database -> cluster -> schema -> table links
            g = _g.V(tuple(tables_ids)).as_('tables')
            g = g.coalesce(__.inE(
                EdgeTypes.Table.value.label).dedup().fold()).as_(
                    EdgeTypes.Table.name)
            g = g.coalesce(__.unfold().outV().hasLabel(
                VertexTypes.Schema.value.label).inE(
                    EdgeTypes.Schema.value.label).dedup().fold()).as_(
                        EdgeTypes.Schema.name)
            g = g.coalesce(__.unfold().outV().hasLabel(
                VertexTypes.Cluster.value.label).inE(
                    EdgeTypes.Cluster.value.label).dedup().fold()).as_(
                        EdgeTypes.Cluster.name)

            # fetch table <- links
            for t in (EdgeTypes.BelongToTable, EdgeTypes.Generates,
                      EdgeTypes.Tag):
                g = g.coalesce(__.select('tables').inE(
                    t.value.label).fold()).as_(t.name)

            # fetch table -> column et al links
            for t in (EdgeTypes.Column, EdgeTypes.Description,
                      EdgeTypes.LastUpdatedAt, EdgeTypes.Source,
                      EdgeTypes.Stat):
                g = g.coalesce(__.select('tables').outE(
                    t.value.label).fold()).as_(t.name)

            # TODO: add owners, watermarks, last timestamp existing, source
            aliases = set([
                t.name
                for t in (EdgeTypes.Table, EdgeTypes.Schema, EdgeTypes.Cluster,
                          EdgeTypes.BelongToTable, EdgeTypes.Generates,
                          EdgeTypes.Tag, EdgeTypes.Column,
                          EdgeTypes.Description, EdgeTypes.LastUpdatedAt,
                          EdgeTypes.Source, EdgeTypes.Stat)
            ])
            g = g.select(*aliases).unfold().select(MapColumn.values).unfold()
            g = g.local(
                __.union(__.outV().id(), __.valueMap(True),
                         __.inV().id()).fold())
            cls._into_existing(g.toList(), existing)

            cls._column_entities(_g=_g,
                                 tables_ids=tables_ids,
                                 existing=existing)

        # fetch Application, User
        for ids in chunk(list(set(all_application_ids + all_owner_ids)), 5000):
            LOGGER.info(f'fetching for application/owners: {ids}')
            g = _g.V(ids).valueMap(True)
            cls._into_existing(g.toList(), existing)
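
# The example above relies on a `chunk` helper that is not shown in this
# excerpt; a minimal sketch of such a batching helper, assuming it simply
# yields fixed-size lists, might look like this:
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar('T')


def chunk(items: Iterable[T], size: int) -> Iterator[List[T]]:
    """Yield successive lists of at most `size` items from `items`."""
    iterator = iter(items)
    while True:
        batch = list(islice(iterator, size))
        if not batch:
            return
        yield batch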