Example #1
def _get_identity_group_hierarchy(g, identity_group_id):
    return (g.V(identity_group_id).project("props", "persistent_ids").by(
        valueMap(True)).by(
            out("member").group().by().by(
                project("props", "transient_ids").by(valueMap(True)).by(
                    out("has_identity").valueMap(True).fold())).select(
                        Column.values)))
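All of the gremlinpython snippets in these examples assume a remote traversal source g and the usual anonymous-traversal imports. A minimal setup sketch, with a placeholder endpoint URL and a hypothetical identity group id:

from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import (
    __, constant, group, has, in_, out, outE, project, select, valueMap, values)
from gremlin_python.process.traversal import Column, Order, P
# Example #7 below additionally uses: from itertools import chain

# Placeholder endpoint; point this at your Gremlin Server or Neptune cluster.
connection = DriverRemoteConnection("ws://localhost:8182/gremlin", "g")
g = traversal().withRemote(connection)

# The helpers in these examples build traversals lazily; apply a terminal step
# such as next() or toList() to execute them.
hierarchy = _get_identity_group_hierarchy(g, "identity-group-1").next()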
Example #2
def get_sibling_attrs(g, transient_id):
    """
    Given a transient id, get a summary of the information we have about it and its sibling nodes.

    We gather:
        * node attributes
        * IP / location information
        * IAB categories of visited websites
    """
    return (g.V(transient_id).choose(
        in_("has_identity"),  # check if this transient id has persistent id
        in_("has_identity").project(
            "identity_group_id", "persistent_id",
            "attributes", "ip_location", "iab_categories").by(
                in_("member").values("igid")).by(values("pid")).by(
                    out("has_identity").valueMap().unfold().group().by(
                        Column.keys).by(
                            select(Column.values).unfold().dedup().fold())).
        by(out("has_identity").out("uses").dedup().valueMap().fold()).by(
            out("has_identity").out("visited").in_("links_to").values(
                "categoryCode").dedup().fold()),
        project("identity_group_id", "persistent_id", "attributes",
                "ip_location",
                "iab_categories").by(constant("")).by(constant("")).by(
                    valueMap().unfold().group().by(Column.keys).by(
                        select(Column.values).unfold().dedup().fold())).by(
                            out("uses").dedup().valueMap().fold()).by(
                                out("visited").in_("links_to").values(
                                    "categoryCode").dedup().fold())))
Example #3
def _get_subgraph(g, website_url, thank_you_page_url, since):
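    # NOTE: the since parameter is accepted but not applied in this traversal.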
    return (g.V().hasLabel("website").has(
        "url", P.within([
            website_url, thank_you_page_url
        ])).in_("visited").in_("has_identity").dedup().limit(20).project(
            "persistent_id", "transient_ids", "visited_events").by(
                values("pid")).by(out("has_identity").values("uid").fold()).by(
                    out("has_identity").outE("visited").has(
                        "visited_url",
                        P.within([website_url, thank_you_page_url
                                  ])).valueMap("visited_url", "ts",
                                               "uid").dedup().fold()))
Example #4
def query_users_active_in_given_date_intervals(g, dt_conditions, limit=300):
    """Get users (persistent identities) that interacted with the website in the given date intervals."""

    return (g.V().hasLabel("persistentId").coin(0.5).limit(limit).where(
        out("has_identity").outE("visited").or_(*dt_conditions)).project(
            "persistent_id", "attributes", "ip_location").by(values("pid")).by(
                out("has_identity").valueMap(
                    "browser", "email",
                    "uid").unfold().group().by(Column.keys).by(
                        select(Column.values).unfold().dedup().fold())).by(
                            out("has_identity").out(
                                "uses").dedup().valueMap().fold()))
Example #5
def query_users_interested_in_content(g, iab_codes, limit=10000):
    """Get users (persistent identities) that interacted with websites having the given IAB codes."""

    return (g.V().hasLabel("persistentId").coin(0.8).limit(limit).where(
        out("has_identity").out("visited").in_("links_to").has(
            "categoryCode", P.within(iab_codes))).project(
                "persistent_id", "attributes",
                "ip_location").by(values("pid")).by(
                    out("has_identity").valueMap(
                        "browser", "email",
                        "uid").unfold().group().by(Column.keys).by(
                            select(Column.values).unfold().dedup().fold())).by(
                                out("has_identity").out(
                                    "uses").dedup().valueMap().fold()))
Example #6
def get_source_and_sink_comps(graph_client: GremlinClient, topology_id: str,
                              topology_ref: str) -> Dict[str, List[str]]:
    """ Gets a dictionary with "sources" and "sinks" keys linking to lists of component
    names for the sources (instances with no incoming logical edges) and sinks (instances
    with no outgoing logical edges). This method is cached as the logical plan is fixed
    for the lifetime of a topology.

    Arguments:
        graph_client (GremlinClient):   The graph database client instance.
        topology_id (str):  The topology identification string.
        topology_ref (str): The topology graph identification string.

    Returns:
        Dict[str, List[str]]:   A dictionary with "sources" and "sinks" keys linking to
        lists of component names for the sources (instances with no incoming logical
        edges) and sinks (instances with no outgoing logical edges).
    """

    sgt: GraphTraversalSource = graph_client.topology_subgraph(
        topology_id, topology_ref)

    sources: List[str] = sgt.V().where(
        in_("logically_connected").count().is_(0)).values(
            "component").dedup().toList()

    sinks: List[str] = sgt.V().where(
        out("logically_connected").count().is_(0)).values(
            "component").dedup().toList()

    return {"sources": sources, "sinks": sinks}
Example #7
def recommend_similar_audience(g,
                               website_url,
                               categories_limit=3,
                               search_time_limit_in_seconds=15):
    """Given website url, categories_limit, categories_coin recommend similar audience in n most popular categories.

    Similar audience - audience of users that at least once visited subpage of domain that contains IAB-category codes
    that are most popular across users of given website
    """
    average_guy = (g.V(website_url).in_("visited").in_(
        "has_identity").dedup().hasLabel("persistentId").group().by().by(
            out("has_identity").out("visited").in_("links_to").groupCount().by(
                "categoryCode")).select(
                    Column.values).unfold().unfold().group().by(
                        Column.keys).by(select(
                            Column.values).mean()).unfold().order().by(
                                Column.values,
                                Order.desc).limit(categories_limit))

    most_popular_categories = dict(
        chain(*category.items()) for category in average_guy.toList())

    guy_stats_subquery = (out("has_identity").out("visited").in_(
        "links_to").groupCount().by("categoryCode").project(
            *most_popular_categories.keys()))

    conditions_subqueries = []
    for i in most_popular_categories:
        guy_stats_subquery = guy_stats_subquery.by(
            choose(select(i), select(i), constant(0)))
        conditions_subqueries.append(
            select(Column.values).unfold().select(i).is_(
                P.gt(int(most_popular_categories[i]))))

    return (g.V().hasLabel("websiteGroup").has(
        "categoryCode", P.within(list(
            most_popular_categories.keys()))).out("links_to").in_("visited").
            dedup().in_("has_identity").dedup().hasLabel("persistentId").where(
                out("has_identity").out("visited").has(
                    "url", P.neq(website_url))).timeLimit(
                        search_time_limit_in_seconds * 1000).local(
                            group().by().by(guy_stats_subquery).where(
                                or_(*conditions_subqueries))).select(
                                    Column.keys).unfold().out(
                                        "has_identity").values("uid"))
Example #8
def _query_users_activities_stats(g,
                                  website_url,
                                  most_popular_categories,
                                  search_time_limit_in_seconds=30):
    return (g.V().hasLabel("websiteGroup").has(
        "categoryCode", P.within(list(
            most_popular_categories.keys()))).out("links_to").in_("visited").
            dedup().in_("has_identity").dedup().hasLabel("persistentId").where(
                out("has_identity").out("visited").has(
                    "url", P.neq(website_url))).timeLimit(
                        search_time_limit_in_seconds * 1000).
            local(group().by().by(
                out("has_identity").out("visited").in_(
                    "links_to").groupCount().by("categoryCode")).project(
                        "pid", "iabs",
                        "tids").by(select(Column.keys).unfold()).by(
                            select(Column.values).unfold()).by(
                                select(Column.keys).unfold().out(
                                    "has_identity").values("uid").fold())))
Example #9
def _get_subgraph(g, identity_group_id):
    return (g.V(identity_group_id).project("props", "persistent_ids").by(
        valueMap(True)).by(
            out("member").group().by().by(
                project("props", "transient_ids").by(valueMap(True)).by(
                    out("has_identity").group().by().by(
                        project(
                            "props", "ip_location",
                            "random_website_paths").by(valueMap(True)).by(
                                out("uses").valueMap(True).fold()).by(
                                    out("visited").as_("start").in_("links_to")
                                    .as_("end").limit(100).path().by(
                                        valueMap("url")).by(
                                            valueMap(
                                                "url",
                                                "categoryCode")).from_("start")
                                    .to("end").dedup().fold())).select(
                                        Column.values))).select(
                                            Column.values)))
Example #10
def _get_categories_popular_across_audience_of_website(g,
                                                       website_url,
                                                       categories_limit=3):
    return (g.V(website_url).in_("visited").in_(
        "has_identity").dedup().hasLabel("persistentId").group().by().by(
            out("has_identity").out("visited").in_("links_to").groupCount().by(
                "categoryCode")).select(
                    Column.values).unfold().unfold().group().by(
                        Column.keys).by(select(
                            Column.values).mean()).unfold().order().by(
                                Column.values,
                                Order.desc).limit(categories_limit))
Example #11
def get_component_paths(graph_client: GremlinClient, topology_id: str,
                        topology_ref: str) -> List[List[str]]:
    """ Gets all component level paths through the specified topology. This method is
    cached as the component paths are fixed for the lifetime of a topology.

    Arguments:
        graph_client (GremlinClient):   The graph database client instance.
        topology_id (str):  The topology identification string.
        topology_ref (str): The topology graph identification string.

    Returns:
        List[List[str]]:    A list of component name string path lists. For example
        [["A", "B", "D"], ["A", "C", "D"]]
    """

    sources_sinks: Dict[str, List[str]] = get_source_and_sink_comps(
        graph_client, topology_id, topology_ref)

    sgt: GraphTraversalSource = graph_client.topology_subgraph(
        topology_id, topology_ref)

    output: List[List[str]] = []

    for source in sources_sinks["sources"]:
        # Pick a start vertex for this source
        start: Vertex = sgt.V().has("component", source).next()
        for sink in sources_sinks["sinks"]:
            LOG.debug(
                "Finding paths from source component: %s to sink component: %s",
                source,
                sink,
            )
            # Find one path from the source vertex to any sink vertex and emit the
            # components as well as the edges.
            full_path: List[Union[str, Edge]] = (sgt.V(start).repeat(
                out("logically_connected").simplePath()).until(
                    has("component",
                        sink)).path().by("component").by().limit(1).next())

            # Filter out the edges and keep the component strings
            path: List[str] = [
                element for element in full_path if isinstance(element, str)
            ]

            output.append(path)

    return output
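A usage sketch with hypothetical identifiers; the result mirrors the docstring example:

paths = get_component_paths(graph_client, "example-topology", "ref-1")
# e.g. [["A", "B", "D"], ["A", "C", "D"]]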
Example #12
def get_levels(topo_traversal: GraphTraversalSource) -> List[List[Vertex]]:
    """ Gets the levels of the logical graph. The traversal starts with the
    source spouts and performs a breadth first search through the logically
    connected vertices.

    Arguments:
        topo_traversal (GraphTraversalSource):  A traversal source instance
                                                mapped to the topology subgraph
                                                whose levels are to be
                                                calculated.

    Returns:
        A list where each entry is a list of Vertex instances representing a
        level within the logical graph. The first level will be the spout
        instances.
    """

    # Only load the static enums we need so we don't pollute the globals dict
    keys = statics.staticEnums["keys"]
    values = statics.staticEnums["values"]
    local_scope = statics.staticEnums["local"]

    # Repeatedly traverse the tree defined by the logical connections, grouping
    # each set of vertices (de-duplicated) by their depth in the tree. The
    # depth is the number of times the repeat step has run so far (loops()).
    # This yields a map from integer depth to list of vertices, which is
    # emitted by the cap step. We then put the map in ascending key order,
    # take only the values (the lists of vertices) and unfold them into a
    # single list.
    # The initial group("m").by(constant(-1)) step ensures the spout vertices
    # are included at the top of the list.
    levels: List[List[Vertex]] = (
        topo_traversal.V().hasLabel("spout").group("m").by(
            constant(-1)).repeat(
                out("logically_connected").dedup().group("m").by(
                    loops())).until(not_(outE("logically_connected"))).
        cap("m").order(local_scope).by(keys).select(values).unfold().toList())

    return levels
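A usage sketch, assuming the statics import used above (topology identifiers are hypothetical):

from gremlin_python import statics  # provides statics.staticEnums

topo_traversal = graph_client.topology_subgraph("example-topology", "ref-1")
for depth, level in enumerate(get_levels(topo_traversal)):
    print(depth, [v.id for v in level])  # level 0 holds the spout vertices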
Example #13
def undecided_users_audience(g, website_url, thank_you_page_url, since,
                             min_visited_count):
    """
    Given a website url, get all the users that meet the audience conditions.

    It returns a list of transient identity uids.

    The audience is built from users that meet the following criteria:
        * visited the website url at least min_visited_count times since a specific timestamp
        * did not visit the thank you page url since that timestamp
    """
    return (g.V(website_url).hasLabel("website").inE("visited").has(
        "ts", P.gt(since)).outV().in_("has_identity").groupCount().unfold(
        ).dedup().where(select(Column.values).is_(
            P.gt(min_visited_count))).select(Column.keys).as_("pids").map(
                out("has_identity").outE("visited").has(
                    "visited_url",
                    thank_you_page_url).has("ts", P.gt(since)).outV().in_(
                        "has_identity").dedup().values("pid").fold()).as_(
                            "pids_that_visited").select("pids").not_(
                                has("pid",
                                    where(P.within("pids_that_visited")))).out(
                                        "has_identity").values("uid"))
Example #14
def get_common_owner(neptune_instance, el_1, el_2):
    '''Returns the lowest common ancestor in the containment tree of the model.
    See: http://tinkerpop.apache.org/docs/current/recipes/#_lowest_common_ancestor'''

    return (neptune_instance.V(el_1).repeat(out('ownerElement')).emit().as_(
        'x').repeat(__.in_('ownerElement')).emit(hasId(el_2)).select(
            'x').limit(1).toList())
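A usage sketch with hypothetical element ids; limit(1) means the list holds at most one ancestor:

owners = get_common_owner(neptune_instance, "element-1", "element-2")
lowest_common_ancestor = owners[0] if owners else None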
Example #15
def _get_transient_ids(query, root_url):
    return (query.select("persistent_ids").unfold().group().by("pid").by(
        out("has_identity").outE("visited").
        has(  # do not go through links_to, as it causes neptune memory errors
            "visited_url", P.between(root_url, root_url + "/zzz")).valueMap(
                "uid", "visited_url").dedup().limit(15).fold()))
Example #16
def _setup_arrival_calcs(
    metrics_client: HeronMetricsClient, graph_client: GremlinClient,
    topology_id: str, cluster: str, environ: str, topology_ref: str,
    start: dt.datetime, end: dt.datetime, io_bucket_length: int,
    tracker_url: str, **kwargs: Union[str, int, float]
) -> Tuple[pd.DataFrame, List[List[Vertex]], pd.DataFrame, Dict[
        Vertex, List[int]], Dict[Vertex, List[int]]]:
    """ Helper method which sets up the data needed for the arrival rate
    calculations. This is a separate cached method as these data are not
    affected by the traffic (spout_state) and so do not need to be recalculated
    for a new traffic level for the same topology id/ref. """

    topo_traversal: GraphTraversalSource = \
        graph_client.topology_subgraph(topology_id, topology_ref)

    # Calculate the routing probabilities for the defined metric gathering
    # period
    i2i_rps: pd.Series = (calculate_inter_instance_rps(
        metrics_client, topology_id, cluster, environ, start, end, tracker_url,
        **kwargs).set_index(["source_task", "destination_task",
                             "stream"])["routing_probability"])

    # Get the vertex levels for the logical graph tree
    LOG.info("Calculating levels for topology %s reference %s", topology_id,
             topology_ref)
    levels: List[List[Vertex]] = get_levels(topo_traversal)
    LOG.debug("Found %d levels is topology %s reference %s", len(levels),
              topology_id, topology_ref)

    # Calculate the input/output ratios for each instance using data from the
    # defined metrics gathering period
    coefficients: pd.Series = lstsq_io_ratios(metrics_client, graph_client,
                                              topology_id, cluster, environ,
                                              start, end, io_bucket_length,
                                              **kwargs).set_index([
                                                  "task", "output_stream",
                                                  "input_stream",
                                                  "source_component"
                                              ])["coefficient"]

    # Get the details of the incoming and outgoing physical connections for
    # each stream manager in the topology

    # Get a dictionary mapping from stream manager id string to a list of the
    # instances (within each container) that will send tuples to each stream
    # manager
    sending_instances: Dict[Vertex, List[int]] = \
        (topo_traversal.V().hasLabel("stream_manager")
         .group().by("id").by(in_("physically_connected")
                              .hasLabel(P.within("spout", "bolt"))
                              .values("task_id")
                              .fold())
         .next())

    # Get a dictionary mapping from stream manager id string to a list of the
    # instances (within each container) that will receive tuples from each
    # stream manager
    receiving_instances: Dict[Vertex, List[int]] = \
        (topo_traversal.V().hasLabel("stream_manager")
         .group().by("id").by(out("physically_connected")
                              .hasLabel("bolt").values("task_id").fold())
         .next())

    return (i2i_rps, levels, coefficients, sending_instances,
            receiving_instances)
Example #17
def _create_physical_connections(graph_client: GremlinClient, topology_id: str,
                                 topology_ref: str) -> None:

    LOG.info(
        "Creating physical connections for topology: %s, reference: "
        "%s", topology_id, topology_ref)

    topo_traversal: GraphTraversalSource = \
        graph_client.topology_subgraph(topology_id, topology_ref)

    # First get all logically connected pairs of vertex and their associated
    # containers and stream managers
    logical_edges: List[Dict[str, Union[Vertex, Edge]]] = (
        topo_traversal.V().hasLabel(P.within(
            "bolt", "spout")).outE("logically_connected").project(
                "source_instance", "source_container", "source_stream_manager",
                "l_edge", "destination_instance", "destination_container",
                "destination_stream_manager").by(outV()).by(
                    outV().out("is_within")).by(outV().out("is_within").in_(
                        "is_within").hasLabel("stream_manager")).by().by(
                            inV()).by(inV().out("is_within")).by(
                                inV().out("is_within").in_("is_within").
                                hasLabel("stream_manager")).toList())

    LOG.debug("Processing %d logical connected vertices", len(logical_edges))

    for logical_edge in logical_edges:
        source: Vertex = logical_edge["source_instance"]
        source_container: Vertex = logical_edge["source_container"]
        source_stream_manager: Vertex = logical_edge["source_stream_manager"]
        destination: Vertex = logical_edge["destination_instance"]
        destination_container: Vertex = logical_edge["destination_container"]
        destination_stream_manager: Vertex = \
            logical_edge["destination_stream_manager"]
        l_edge: Edge = logical_edge["l_edge"]

        # Connect the source instance to its stream manager, checking first
        # if the connection already exists
        (graph_client.graph_traversal.V(source).coalesce(
            out("physically_connected").is_(source_stream_manager),
            addE("physically_connected").to(source_stream_manager)).next())

        if source_container == destination_container:

            # If the source and destination instances are in the same
            # container then they share the same stream manager so just use
            # the source stream manager found above. Connect the source
            # stream manager to the destination instance

            (graph_client.graph_traversal.V(source_stream_manager).coalesce(
                out("physically_connected").is_(destination),
                addE("physically_connected").to(destination)).next())

            # Set the logical edge for this pair to "local"
            graph_client.graph_traversal.E(l_edge).property("type",
                                                            "local").next()

        else:
            # Connect the two stream managers (if they aren't already)
            (graph_client.graph_traversal.V(source_stream_manager).coalesce(
                out("physically_connected").is_(destination_stream_manager),
                addE("physically_connected").to(
                    destination_stream_manager)).next())

            (graph_client.graph_traversal.V(
                destination_stream_manager).coalesce(
                    out("physically_connected").is_(destination),
                    addE("physically_connected").to(destination)).next())

            # Set the logical edge for this pair to "remote"
            graph_client.graph_traversal.E(l_edge).property("type",
                                                            "remote").next()