def _get_or_created_edge_to(self, node: GraphTraversal, other: int, label: str):
    """Return the edge with `label` from `node` to vertex `other`, creating it if it does not already exist."""
    return node.coalesce(
        __.outE(label).filter(__.inV().hasId(other)),
        __.addE(label).to(self.g.V(other)))
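# A minimal, self-contained sketch of the same get-or-create pattern used by
# _get_or_created_edge_to above: coalesce() returns the first branch that
# yields a result, so an existing edge is reused and addE() only runs when no
# matching edge is found. The endpoint, vertex ids, and edge label below are
# illustrative assumptions, not values taken from the source.
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection


def get_or_create_edge(g, source_id, target_id, label):
    # First coalesce branch: an existing outgoing edge with this label whose
    # head vertex is target_id. Second branch: create that edge.
    return g.V(source_id).coalesce(
        __.outE(label).filter(__.inV().hasId(target_id)),
        __.addE(label).to(__.V(target_id))).next()


if __name__ == "__main__":
    conn = DriverRemoteConnection("ws://localhost:8182/gremlin", "g")  # assumed endpoint
    g = traversal().withRemote(conn)
    edge = get_or_create_edge(g, 1, 3, "created")  # ids/label assumed for illustration
    print(edge)
    conn.close()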
def test_side_effects(self, remote_connection):
    statics.load_statics(globals())
    #
    g = Graph().traversal().withRemote(remote_connection)
    ###
    t = g.V().hasLabel("project").name.iterate()
    assert 0 == len(t.side_effects.keys())
    with pytest.raises(Exception):
        m = t.side_effects["m"]
    ###
    t = g.V().out("created").groupCount("m").by("name")
    results = t.toSet()
    assert 2 == len(results)
    assert Vertex(3) in results
    assert Vertex(5) in results
    assert 1 == len(t.side_effects.keys())
    assert "m" in t.side_effects.keys()
    m = t.side_effects["m"]
    assert isinstance(m, dict)
    assert 2 == len(m)
    assert 3 == m["lop"]
    assert 1 == m["ripple"]
    assert isinstance(m["lop"], long)
    assert isinstance(m["ripple"], long)
    ##
    t = g.V().out("created").groupCount("m").by("name").name.aggregate("n")
    results = t.toSet()
    assert 2 == len(results)
    assert "lop" in results
    assert "ripple" in results
    assert 2 == len(t.side_effects.keys())
    assert "m" in t.side_effects.keys()
    assert "n" in t.side_effects.keys()
    n = t.side_effects.get("n")
    assert isinstance(n, dict)
    assert 2 == len(n)
    assert "lop" in n.keys()
    assert "ripple" in n.keys()
    assert 3 == n["lop"]
    assert 1 == n["ripple"]
    t = g.withSideEffect('m', 32).V().map(lambda: "x: x.sideEffects('m')")
    results = t.toSet()
    assert 1 == len(results)
    assert 32 == list(results)[0]
    assert 32 == t.side_effects['m']
    assert 1 == len(t.side_effects.keys())
    with pytest.raises(Exception):
        x = t.side_effects["x"]
    a = g.V().has("name", "marko").next()
    b = g.V().has("name", "peter").next()
    edge = g.withSideEffect("b", b).V(a).addE("knows").to("b").next()
    assert "knows" == edge.label
    assert a == edge.outV
    assert b == edge.inV
    g.V().has("name", "marko").outE("knows").where(__.inV().has(
        "name", "peter")).drop().iterate()
    ##
    edge = g.withSideEffect("a", a).withSideEffect(
        "b", b).V().limit(1).addE("knows").from_("a").to("b").next()
    assert "knows" == edge.label
    assert a == edge.outV
    assert b == edge.inV
    g.V().has("name", "marko").outE("knows").where(__.inV().has(
        "name", "peter")).drop().iterate()
def test_side_effects(self, remote_connection):
    statics.load_statics(globals())
    #
    g = traversal().withRemote(remote_connection)
    ###
    t = g.V().hasLabel("project").name.iterate()
    assert 0 == len(t.side_effects.keys())
    with pytest.raises(Exception):
        m = t.side_effects["m"]
    ###
    t = g.V().out("created").groupCount("m").by("name")
    results = t.toSet()
    assert 2 == len(results)
    assert Vertex(3) in results
    assert Vertex(5) in results
    assert 1 == len(t.side_effects.keys())
    assert "m" in t.side_effects.keys()
    m = t.side_effects["m"]
    assert isinstance(m, dict)
    assert 2 == len(m)
    assert 3 == m["lop"]
    assert 1 == m["ripple"]
    assert isinstance(m["lop"], long)
    assert isinstance(m["ripple"], long)
    # check status attributes
    assert "host" in t.side_effects.status_attributes
    ##
    t = g.V().out("created").groupCount("m").by("name").name.aggregate("n")
    results = t.toSet()
    assert 2 == len(results)
    assert "lop" in results
    assert "ripple" in results
    assert 2 == len(t.side_effects.keys())
    assert "m" in t.side_effects.keys()
    assert "n" in t.side_effects.keys()
    n = t.side_effects.get("n")
    assert isinstance(n, dict)
    assert 2 == len(n)
    assert "lop" in n.keys()
    assert "ripple" in n.keys()
    assert 3 == n["lop"]
    assert 1 == n["ripple"]
    t = g.withSideEffect('m', 32).V().map(lambda: "x: x.sideEffects('m')")
    results = t.toSet()
    assert 1 == len(results)
    assert 32 == list(results)[0]
    assert 32 == t.side_effects['m']
    assert 1 == len(t.side_effects.keys())
    with pytest.raises(Exception):
        x = t.side_effects["x"]
    a = g.V().has("name", "marko").next()
    b = g.V().has("name", "peter").next()
    edge = g.withSideEffect("b", b).V(a).addE("knows").to("b").next()
    assert "knows" == edge.label
    assert a == edge.outV
    assert b == edge.inV
    g.V().has("name", "marko").outE("knows").where(
        __.inV().has("name", "peter")).drop().iterate()
    ##
    edge = g.withSideEffect("a", a).withSideEffect(
        "b", b).V().limit(1).addE("knows").from_("a").to("b").next()
    assert "knows" == edge.label
    assert a == edge.outV
    assert b == edge.inV
    g.V().has("name", "marko").outE("knows").where(
        __.inV().has("name", "peter")).drop().iterate()
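# A compact sketch of the side-effect access pattern the two tests above
# exercise, assuming a gremlinpython release in which remote traversals still
# expose t.side_effects after iteration (as these tests require). The server
# endpoint is an assumption; the expected values correspond to TinkerPop's
# "modern" sample graph, which these tests appear to run against.
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection


def group_count_side_effect():
    conn = DriverRemoteConnection("ws://localhost:8182/gremlin", "g")  # assumed endpoint
    g = traversal().withRemote(conn)
    try:
        # groupCount("m") stores its counts under the side-effect key "m";
        # the key only becomes readable once the traversal has been iterated.
        t = g.V().out("created").groupCount("m").by("name")
        t.toSet()
        m = t.side_effects["m"]       # e.g. {"lop": 3, "ripple": 1} on the modern graph
        keys = t.side_effects.keys()  # {"m"}
        return m, keys
    finally:
        conn.close()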
def set_fields_routing_probs(graph_client: GremlinClient,
                             metrics_client: HeronMetricsClient,
                             topology_id: str, topology_ref: str,
                             start: dt.datetime, end: dt.datetime) -> None:
    """ Sets the routing probabilities for fields grouped logical connections
    in the physical graph with the supplied topology ID and reference. Routing
    probabilities are calculated using metrics from the defined time window.

    Arguments:
        graph_client (GremlinClient): The client instance for the graph
                                      database.
        metrics_client (HeronMetricsClient): The client instance for the
                                             metrics database.
        topology_id (str): The topology identification string.
        topology_ref (str): The topology reference string.
        start (dt.datetime): The UTC datetime object for the start of the
                             metrics gathering window.
        end (dt.datetime): The UTC datetime object for the end of the metrics
                           gathering window.
    """
    LOG.info("Setting fields grouping routing probabilities for topology %s "
             "reference %s using metrics data from %s to %s", topology_id,
             topology_ref, start.isoformat(), end.isoformat())

    topology_traversal: GraphTraversalSource = \
        graph_client.topology_subgraph(topology_id, topology_ref)

    i_to_i_rps: pd.DataFrame = calculate_inter_instance_rps(
        metrics_client, topology_id, start, end)

    # Re-index the DataFrame to make selecting RPs faster
    i_to_i_rps.set_index(["source_task", "stream", "destination_task"],
                         inplace=True)

    # Get a list of all fields grouped connections in the physical graph
    fields_connections: List[Dict[str, Union[int, str, Edge]]] = \
        (topology_traversal.V()
         .outE("logically_connected")
         .has("grouping", "FIELDS")
         .project("source_task", "stream", "edge", "destination_task")
         .by(__.outV().properties("task_id").value())
         .by(__.properties("stream").value())
         .by()
         .by(__.inV().properties("task_id").value())
         .toList())

    LOG.debug("Processing %d fields grouped connections for topology %s "
              "reference %s", len(fields_connections), topology_id,
              topology_ref)

    connection: Dict[str, Union[int, str, Edge]]
    for connection in fields_connections:

        LOG.debug("Processing connection from instance %d to %d on stream %s",
                  connection["source_task"], connection["destination_task"],
                  connection["stream"])

        routing_prob: float = (i_to_i_rps.loc[
            connection["source_task"], connection["stream"],
            connection["destination_task"]]["routing_probability"])

        (topology_traversal.E(connection["edge"])
         .property("routing_probability", routing_prob).next())
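# A small, self-contained illustration of the DataFrame lookup performed in the
# loop above: after set_index(["source_task", "stream", "destination_task"]),
# .loc[(source, stream, destination)] selects a single row of the MultiIndexed
# frame and ["routing_probability"] extracts the scalar written back onto the
# edge. The sample figures below are made up purely for illustration.
import pandas as pd

i_to_i_rps = pd.DataFrame({
    "source_task": [1, 1, 2],
    "stream": ["word", "word", "word"],
    "destination_task": [3, 4, 3],
    "routing_probability": [0.6, 0.4, 1.0],
})
i_to_i_rps.set_index(["source_task", "stream", "destination_task"],
                     inplace=True)

# Same selection style as the loop above; yields 0.4 for this sample frame.
routing_prob = i_to_i_rps.loc[(1, "word", 4)]["routing_probability"]
print(routing_prob)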
def table_entities(cls, *, _g: GraphTraversalSource, table_data: List[Table],
                   existing: EXISTING) -> None:
    all_tables_ids = list(set([
        VertexTypes.Table.value.id(key=TableUris.get(
            database=t.database, cluster=t.cluster, schema=t.schema,
            table=t.name).table)
        for t in table_data]))
    all_owner_ids = list(set([
        VertexTypes.User.value.id(key=key)
        for key in [t.table_writer.id for t in table_data
                    if t.table_writer is not None]]))
    all_application_ids = list(set(list(
        possible_vertex_ids_for_application_key(*[
            t.table_writer.id for t in table_data
            if t.table_writer is not None]))))

    # chunk these since 100,000s seems to choke
    for tables_ids in chunk(all_tables_ids, 1000):
        LOGGER.info(f'fetching for tables: {tables_ids}')
        # fetch database -> cluster -> schema -> table links
        g = _g.V(tuple(tables_ids)).as_('tables')
        g = g.coalesce(__.inE(
            EdgeTypes.Table.value.label).dedup().fold()).as_(
            EdgeTypes.Table.name)
        g = g.coalesce(__.unfold().outV().hasLabel(
            VertexTypes.Schema.value.label).inE(
            EdgeTypes.Schema.value.label).dedup().fold()).as_(
            EdgeTypes.Schema.name)
        g = g.coalesce(__.unfold().outV().hasLabel(
            VertexTypes.Cluster.value.label).inE(
            EdgeTypes.Cluster.value.label).dedup().fold()).as_(
            EdgeTypes.Cluster.name)
        # fetch table <- links
        for t in (EdgeTypes.BelongToTable, EdgeTypes.Generates,
                  EdgeTypes.Tag):
            g = g.coalesce(__.select('tables').inE(
                t.value.label).fold()).as_(t.name)
        # fetch table -> column et al links
        for t in (EdgeTypes.Column, EdgeTypes.Description,
                  EdgeTypes.LastUpdatedAt, EdgeTypes.Source, EdgeTypes.Stat):
            g = g.coalesce(__.select('tables').outE(
                t.value.label).fold()).as_(t.name)
        # TODO: add owners, watermarks, last timestamp existing, source
        aliases = set([
            t.name for t in (
                EdgeTypes.Table, EdgeTypes.Schema, EdgeTypes.Cluster,
                EdgeTypes.BelongToTable, EdgeTypes.Generates, EdgeTypes.Tag,
                EdgeTypes.Column, EdgeTypes.Description,
                EdgeTypes.LastUpdatedAt, EdgeTypes.Source, EdgeTypes.Stat)])
        g = g.select(*aliases).unfold().select(MapColumn.values).unfold()
        g = g.local(__.union(
            __.outV().id(), __.valueMap(True), __.inV().id()).fold())
        cls._into_existing(g.toList(), existing)
        cls._column_entities(_g=_g, tables_ids=tables_ids, existing=existing)

    # fetch Application, User
    for ids in chunk(list(set(all_application_ids + all_owner_ids)), 5000):
        LOGGER.info(f'fetching for application/owners: {ids}')
        g = _g.V(ids).valueMap(True)
        cls._into_existing(g.toList(), existing)
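# The chunk() helper used above is not shown in this snippet. A minimal sketch
# of a batching generator with the same call shape (iterable, batch size) might
# look like the following; this is an assumption about its behaviour, not the
# project's actual implementation.
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")


def chunk(items: Iterable[T], size: int) -> Iterator[List[T]]:
    # Yield successive lists of at most `size` elements from `items`,
    # so large id sets can be fetched in bounded batches.
    it = iter(items)
    while True:
        batch = list(islice(it, size))
        if not batch:
            return
        yield batch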