Example #1
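These examples call gremlin_python's anonymous traversal steps (in_, out, values, select, project, valueMap, coalesce, constant) and the Column and P enums directly. A minimal set of imports that makes them resolve, assuming gremlin_python 3.4+, would be:

from gremlin_python.process.graph_traversal import (
    coalesce, constant, in_, out, project, select, valueMap, values)
from gremlin_python.process.traversal import Column, P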
def get_sibling_attrs(g, transient_id):
    """
    Given a transient id, get a summary of the information we have about it
    and its sibling nodes.

    We gather:
        * node attributes
        * IP / location information
        * IAB categories of visited websites
    """
    return (
        g.V(transient_id).choose(
            # Does this transient id have a persistent identity?
            in_("has_identity"),
            # Yes: summarise over all sibling transient nodes that share it.
            in_("has_identity")
            .project("identity_group_id", "persistent_id", "attributes",
                     "ip_location", "iab_categories")
            .by(in_("member").values("igid"))
            .by(values("pid"))
            # Merge the attribute maps of all sibling transient nodes.
            .by(out("has_identity").valueMap().unfold().group()
                .by(Column.keys)
                .by(select(Column.values).unfold().dedup().fold()))
            # IP / location vertices used by any sibling.
            .by(out("has_identity").out("uses").dedup().valueMap().fold())
            # IAB categories of websites visited by any sibling.
            .by(out("has_identity").out("visited").in_("links_to")
                .values("categoryCode").dedup().fold()),
            # No: fall back to the transient node's own data.
            project("identity_group_id", "persistent_id", "attributes",
                    "ip_location", "iab_categories")
            .by(constant(""))
            .by(constant(""))
            .by(valueMap().unfold().group()
                .by(Column.keys)
                .by(select(Column.values).unfold().dedup().fold()))
            .by(out("uses").dedup().valueMap().fold())
            .by(out("visited").in_("links_to")
                .values("categoryCode").dedup().fold())))
def get_all_devices_from_website_visitors(g, website_id, limit=100):
    """Get all transient ids (including siblings) that visited the given page."""

    return (g.V(website_id)
            .project("transient_ids_no_persistent",
                     "transient_ids_with_siblings")
            # Transient nodes that visited the page directly.
            .by(in_("visited").limit(limit).fold())
            # Transient nodes reachable through a shared persistent identity
            # of any visitor (i.e. the visitors' sibling devices).
            .by(in_("visited").in_("has_identity").dedup()
                .out("has_identity").limit(limit).fold())
            .select(Column.values).unfold().unfold().dedup())
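Like the helper above, this returns an unterminated traversal, so a terminal step such as toList() is needed to submit it (the website id below is a placeholder):

devices = get_all_devices_from_website_visitors(g, "website-42", limit=50).toList()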
Example #3
def get_source_and_sink_comps(graph_client: GremlinClient, topology_id: str,
                              topology_ref: str) -> Dict[str, List[str]]:
    """ Gets a dictionary with "sources" and "sinks" keys linking to lists of component
    names for the sources (instances with no incoming logical edges) and sinks (instances
    with no outgoing logical edges). This method is cached as the logical plan is fixed
    for the lifetime of a topology.

    Arguments:
        graph_client (GremlinClient):   The graph database client instance.
        topology_id (str):  The topology identification string.
        topology_ref (str): The topology graph identification string.

    Returns:
        Dict[str, List[str]]:   A dictionary with "sources" and "sinks" keys linking to
        lists of component names for the sources (instances with no incoming logical
        edges) and sinks (instances with no outgoing logical edges).
    """

    sgt: GraphTraversalSource = graph_client.topology_subgraph(
        topology_id, topology_ref)

    sources: List[str] = sgt.V().where(
        in_("logically_connected").count().is_(0)).values(
            "component").dedup().toList()

    sinks: List[str] = sgt.V().where(
        out("logically_connected").count().is_(0)).values(
            "component").dedup().toList()

    return {"sources": sources, "sinks": sinks}
Example #4
def query_transient_nodes_for_website(g, website_id, limit=10000):
    """Group transient nodes that visited the website by their persistent id.

    Nodes with no persistent id are collected under the catch-all key
    "transient-nodes-connected-to-website"; only groups containing more
    than one transient node are returned.
    """
    return (g.V(website_id).in_("visited").limit(limit)
            .project("uid", "pid")
            .by("uid")
            .by(in_("has_identity").values("pid").fold())
            # Group uids by persistent id, with a catch-all key for
            # transient nodes that have no persistent identity.
            .group()
            .by(coalesce(select("pid").unfold(),
                         constant("transient-nodes-connected-to-website")))
            .by(select("uid").dedup().limit(100).fold())
            .unfold()
            .project("persistent-node-id", "transient-nodes")
            .by(select(Column.keys))
            .by(select(Column.values))
            .where(select("transient-nodes").unfold().count().is_(P.gt(1)))
            ).toList()
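Each element of the returned list is a map with "persistent-node-id" and "transient-nodes" keys, so the result can be consumed along these lines (the website id is a placeholder):

for group in query_transient_nodes_for_website(g, "website-42"):
    print(group["persistent-node-id"], len(group["transient-nodes"]))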
Example #5
def _setup_arrival_calcs(
    metrics_client: HeronMetricsClient, graph_client: GremlinClient,
    topology_id: str, cluster: str, environ: str, topology_ref: str,
    start: dt.datetime, end: dt.datetime, io_bucket_length: int,
    tracker_url: str, **kwargs: Union[str, int, float]
) -> Tuple[pd.DataFrame, List[List[Vertex]], pd.DataFrame, Dict[
        Vertex, List[int]], Dict[Vertex, List[int]]]:
    """ Helper method which sets up the data needed for the arrival rate
    calculations. This is a separate cached method as these data are not
    affected by the traffic (spout_state) and so do not need to be recalculated
    for a new traffic level for the same topology id/ref. """

    topo_traversal: GraphTraversalSource = \
        graph_client.topology_subgraph(topology_id, topology_ref)

    # Calculate the routing probabilities for the defined metric gathering
    # period
    i2i_rps: pd.Series = (calculate_inter_instance_rps(
        metrics_client, topology_id, cluster, environ, start, end, tracker_url,
        **kwargs).set_index(["source_task", "destination_task",
                             "stream"])["routing_probability"])

    # Get the vertex levels for the logical graph tree
    LOG.info("Calculating levels for topology %s reference %s", topology_id,
             topology_ref)
    levels: List[List[Vertex]] = get_levels(topo_traversal)
    LOG.debug("Found %d levels is topology %s reference %s", len(levels),
              topology_id, topology_ref)

    # Calculate the input/output ratios for each instance using data from the
    # defined metrics gathering period
    coefficients: pd.Series = lstsq_io_ratios(metrics_client, graph_client,
                                              topology_id, cluster, environ,
                                              start, end, io_bucket_length,
                                              **kwargs).set_index([
                                                  "task", "output_stream",
                                                  "input_stream",
                                                  "source_component"
                                              ])["coefficient"]

    # Get the details of the incoming and outgoing physical connections for
    # each stream manager in the topology

    # Get a dictionary mapping from stream manager id string to a list of the
    # instances (within each container) that will send tuples to each stream
    # manager
    sending_instances: Dict[Vertex, List[int]] = \
        (topo_traversal.V().hasLabel("stream_manager")
         .group().by("id").by(in_("physically_connected")
                              .hasLabel(P.within("spout", "bolt"))
                              .values("task_id")
                              .fold())
         .next())

    # Get a dictionary mapping from stream manager id string to a list of the
    # instances (within each container) that will receive tuples from each
    # stream manager
    receiving_instances: Dict[Vertex, List[int]] = \
        (topo_traversal.V().hasLabel("stream_manager")
         .group().by("id").by(out("physically_connected")
                              .hasLabel("bolt").values("task_id").fold())
         .next())

    return (i2i_rps, levels, coefficients, sending_instances,
            receiving_instances)
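As an aside (not part of the original helper), the two grouped dictionaries map each stream manager id to the task ids of the instances it exchanges tuples with, so the returned values can be inspected with something like:

# Sketch only: log how many instances feed tuples into each stream manager.
for stream_manager, task_ids in sending_instances.items():
    LOG.debug("Stream manager %s receives tuples from %d instances",
              stream_manager, len(task_ids))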