def get_sibling_attrs(g, transient_id):
    """
    Summarize everything known about a transient id or its sibling nodes.

    If the transient id is linked to a persistent identity
    (via an incoming "has_identity" edge), the summary is gathered across
    all sibling transient nodes of that identity; otherwise only the node's
    own data is used. The projected map contains:

    * identity_group_id / persistent_id (empty strings when no identity)
    * node attributes (grouped, de-duplicated values per key)
    * IP / location information ("uses" neighbours)
    * IAB categories of visited websites
    """
    identity_branch = (
        in_("has_identity")
        .project("identity_group_id", "persistent_id", "attributes",
                 "ip_location", "iab_categories")
        # identity group id comes from the "member" edge's source vertex
        .by(in_("member").values("igid"))
        .by(values("pid"))
        # merge attributes of every sibling transient node, de-duplicated per key
        .by(out("has_identity").valueMap().unfold().group()
            .by(Column.keys)
            .by(select(Column.values).unfold().dedup().fold()))
        .by(out("has_identity").out("uses").dedup().valueMap().fold())
        .by(out("has_identity").out("visited").in_("links_to")
            .values("categoryCode").dedup().fold()))

    standalone_branch = (
        project("identity_group_id", "persistent_id", "attributes",
                "ip_location", "iab_categories")
        # no persistent identity: id fields are empty strings
        .by(constant(""))
        .by(constant(""))
        .by(valueMap().unfold().group()
            .by(Column.keys)
            .by(select(Column.values).unfold().dedup().fold()))
        .by(out("uses").dedup().valueMap().fold())
        .by(out("visited").in_("links_to")
            .values("categoryCode").dedup().fold()))

    # branch on whether the transient id has a persistent identity
    return (g.V(transient_id)
            .choose(in_("has_identity"), identity_branch, standalone_branch))
def get_all_devices_from_website_visitors(g, website_id, limit=100):
    """
    Get all transient ids (including siblings) that visited the given page.

    Two sets are collected and merged: visitors with no persistent identity,
    and every sibling device of visitors that do have one. The result is a
    de-duplicated stream of transient-id vertices, each side capped at
    ``limit`` before merging.
    """
    no_persistent = in_("visited").limit(limit).fold()
    # hop visitor -> persistent identity -> all sibling transient nodes
    with_siblings = (in_("visited").in_("has_identity").dedup()
                     .out("has_identity").limit(limit).fold())
    return (g.V(website_id)
            .project("transient_ids_no_persistent",
                     "transient_ids_with_siblings")
            .by(no_persistent)
            .by(with_siblings)
            # flatten both folded lists into one de-duplicated stream
            .select(Column.values).unfold().unfold().dedup())
def get_source_and_sink_comps(graph_client: GremlinClient, topology_id: str,
                              topology_ref: str) -> Dict[str, List[str]]:
    """
    Find the source and sink components of a topology's logical graph.

    Sources are components whose instances have no incoming
    "logically_connected" edges; sinks have no outgoing ones. The logical
    plan is fixed for the lifetime of a topology, so callers may cache the
    result of this method.

    Arguments:
        graph_client (GremlinClient): The graph database client instance.
        topology_id (str): The topology identification string.
        topology_ref (str): The topology graph identification string.

    Returns:
        Dict[str, List[str]]: A dictionary with "sources" and "sinks" keys,
        each mapping to a de-duplicated list of component names.
    """
    subgraph: GraphTraversalSource = graph_client.topology_subgraph(
        topology_id, topology_ref)

    source_comps: List[str] = (
        subgraph.V()
        .where(in_("logically_connected").count().is_(0))
        .values("component").dedup().toList())

    sink_comps: List[str] = (
        subgraph.V()
        .where(out("logically_connected").count().is_(0))
        .values("component").dedup().toList())

    return {"sources": source_comps, "sinks": sink_comps}
def query_transient_nodes_for_website(g, website_id, limit=10000,
                                      siblings_limit=100):
    """
    Group the transient nodes that visited a website by persistent identity.

    Visitors are projected to (uid, pid) pairs and grouped by their
    persistent id; visitors with no persistent identity fall into the
    sentinel bucket "transient-nodes-connected-to-website". Only groups
    with more than one transient node are returned.

    Arguments:
        g: Gremlin traversal source.
        website_id: Vertex id of the website.
        limit (int): Maximum number of visitor vertices to consider.
        siblings_limit (int): Maximum number of transient nodes kept per
            persistent-identity group (was a hard-coded 100).

    Returns:
        list: Maps with "persistent-node-id" and "transient-nodes" keys.
    """
    return (g.V(website_id).in_("visited").limit(limit)
            .project("uid", "pid")
            .by("uid")
            .by(in_("has_identity").values("pid").fold())
            .group()
            # bucket by persistent id; sentinel key when none exists
            .by(coalesce(
                select("pid").unfold(),
                constant("transient-nodes-connected-to-website")))
            .by(select("uid").dedup().limit(siblings_limit).fold())
            .unfold()
            .project("persistent-node-id", "transient-nodes")
            .by(select(Column.keys))
            .by(select(Column.values))
            # keep only identities that span multiple transient nodes
            .where(select("transient-nodes").unfold().count().is_(
                P.gt(1)))).toList()
def _setup_arrival_calcs(
        metrics_client: HeronMetricsClient, graph_client: GremlinClient,
        topology_id: str, cluster: str, environ: str, topology_ref: str,
        start: dt.datetime, end: dt.datetime, io_bucket_length: int,
        tracker_url: str, **kwargs: Union[str, int, float]
) -> Tuple[pd.DataFrame, List[List[Vertex]], pd.DataFrame,
           Dict[Vertex, List[int]], Dict[Vertex, List[int]]]:
    """ Helper method which sets up the data needed for the arrival rate
    calculations. This is a separate cached method as these data are not
    effected by the traffic (spout_state) and so do not need to be
    recalculated for a new traffic level for the same topology id/ref.

    Returns a 5-tuple of:
        * inter-instance routing probabilities indexed by
          (source_task, destination_task, stream)
        * the vertex levels of the logical graph tree
        * input/output ratio coefficients indexed by
          (task, output_stream, input_stream, source_component)
        * stream manager id -> task ids sending tuples to it
        * stream manager id -> task ids receiving tuples from it
    """
    topo_traversal: GraphTraversalSource = \
        graph_client.topology_subgraph(topology_id, topology_ref)

    # Calculate the routing probabilities for the defined metric gathering
    # period
    i2i_rps: pd.Series = (calculate_inter_instance_rps(
        metrics_client, topology_id, cluster, environ, start, end,
        tracker_url, **kwargs).set_index(
            ["source_task", "destination_task",
             "stream"])["routing_probability"])

    # Get the vertex levels for the logical graph tree
    LOG.info("Calculating levels for topology %s reference %s",
             topology_id, topology_ref)
    levels: List[List[Vertex]] = get_levels(topo_traversal)
    # NOTE(review): "is" in this message looks like a typo for "in" — left
    # unchanged here as it is a runtime string.
    LOG.debug("Found %d levels is topology %s reference %s",
              len(levels), topology_id, topology_ref)

    # Calculate the input output ratios for each instances using data from
    # the defined metrics gathering period
    coefficients: pd.Series = lstsq_io_ratios(
        metrics_client, graph_client, topology_id, cluster, environ, start,
        end, io_bucket_length, **kwargs).set_index([
            "task", "output_stream", "input_stream", "source_component"
        ])["coefficient"]

    # Get the details of the incoming and outgoing physical connections for
    # stream manager in the topology

    # Get a dictionary mapping from stream manager id string to a list of
    # the instances (within each container) that will send tuples to each
    # stream manager
    sending_instances: Dict[Vertex, List[int]] = \
        (topo_traversal.V().hasLabel("stream_manager")
         .group().by("id").by(in_("physically_connected")
                              .hasLabel(P.within("spout", "bolt"))
                              .values("task_id")
                              .fold())
         .next())

    # Get a dictionary mapping from stream manager id string to a list of
    # the instances (within each container) that will receive tuples from
    # each stream manager.  Only bolts receive tuples, hence the narrower
    # label filter than the sending query above.
    receiving_instances: Dict[Vertex, List[int]] = \
        (topo_traversal.V().hasLabel("stream_manager")
         .group().by("id").by(out("physically_connected")
                              .hasLabel("bolt").values("task_id").fold())
         .next())

    return (i2i_rps, levels, coefficients, sending_instances,
            receiving_instances)