Example #1
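All of these snippets are written against the Gremlin-Python DSL and omit their imports. A preamble along the following lines (assumed here; the original examples don't show it) makes the bare step names such as in_, out, constant, select and the Column / Order / P tokens resolvable:

# Assumed import preamble; not part of the original snippets.
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import (
    __, and_, choose, coalesce, constant, count, dedup, group, groupCount,
    has, identity, in_, inE, inV, label, local, loops, not_, or_, out,
    outE, outV, project, select, timeLimit, unfold, union, valueMap,
    values, where)
from gremlin_python.process.traversal import Column, Order, P

# Some codebases instead inject every anonymous step into the globals,
# which also covers names like id() used in Example #5:
# from gremlin_python import statics
# statics.load_statics(globals())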
def get_sibling_attrs(g, transient_id):
    """
    Given a transient id, get a summary of the information we have about it or its sibling nodes.

    We gather:
        * node attributes
        * IP / location information
        * IAB categories of visited websites
    """
    return (g.V(transient_id).choose(
        in_("has_identity"),  # check if this transient id has persistent id
        in_("has_identity").project(
            "identity_group_id", "persistent_id",
            "attributes", "ip_location", "iab_categories").by(
                in_("member").values("igid")).by(values("pid")).by(
                    out("has_identity").valueMap().unfold().group().by(
                        Column.keys).by(
                            select(Column.values).unfold().dedup().fold())).
        by(out("has_identity").out("uses").dedup().valueMap().fold()).by(
            out("has_identity").out("visited").in_("links_to").values(
                "categoryCode").dedup().fold()),
        project("identity_group_id", "persistent_id", "attributes",
                "ip_location",
                "iab_categories").by(constant("")).by(constant("")).by(
                    valueMap().unfold().group().by(Column.keys).by(
                        select(Column.values).unfold().dedup().fold())).by(
                            out("uses").dedup().valueMap().fold()).by(
                                out("visited").in_("links_to").values(
                                    "categoryCode").dedup().fold())))
Example #2
def undecided_user_audience_check(g, transient_id, website_url,
                                  thank_you_page_url, since,
                                  min_visited_count):
    """
    Given transient id, check whether it belongs to an audience.

    It's a simple yes/no question.

    User belongs to an audience whenever all of the following criteria are met:
        * visited some website url at least X times since specific timestamp
        * did not visit thank you page url since specific timestamp
    """
    return (g.V(transient_id).hasLabel("transientId").in_("has_identity").out(
        "has_identity").outE("visited").has("ts", P.gt(since)).choose(
            has("visited_url", website_url),
            groupCount("visits").by(constant("page_visits"))).choose(
                has("visited_url", thank_you_page_url),
                groupCount("visits").by(
                    constant("thank_you_page_vists"))).cap("visits").coalesce(
                        and_(
                            coalesce(select("thank_you_page_visits"),
                                     constant(0)).is_(0),
                            select("page_visits").is_(
                                P.gt(min_visited_count))).choose(
                                    count().is_(1), constant(True)),
                        constant(False)))
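The traversal resolves to a single boolean, so a hedged call could look like this (all parameter values are illustrative):

# Hypothetical parameters; only the call pattern matters.
belongs = undecided_user_audience_check(
    g,
    "transient-id-123",
    website_url="http://example.com/product",
    thank_you_page_url="http://example.com/thank-you",
    since=1514764800000,  # cutoff value for the has("ts", P.gt(since)) filter
    min_visited_count=5).next()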
Example #3
def get_activity_of_early_adopters(g,
                                   thank_you_page_url,
                                   skip_single_transients=False,
                                   limit=5):
    """
    Given a thank-you page url, find the first early adopters of the product.

    In other words:
        * find the first few persistent identities (or transient ones if they're not matched with any user)
          that visited given thank you page
        * extract their *whole* activity on the domain of the thank_you_page
    """
    return (g.V(thank_you_page_url).hasLabel("website").as_("thank_you").in_(
        "links_to").as_("website_group").select("thank_you").inE(
            "visited").order().by("ts").choose(
                constant(skip_single_transients).is_(P.eq(True)),
                where(outV().in_("has_identity")), identity()).choose(
                    outV().in_("has_identity"),
                    project("type", "id",
                            "purchase_ts").by(constant("persistent")).by(
                                outV().in_("has_identity")).by(values("ts")),
                    project("type", "id", "purchase_ts").by(
                        constant("transient")).by(outV()).by(values("ts"))).
            dedup("id").limit(limit).choose(
                select("type").is_("persistent"),
                project("persistent_id", "transient_id",
                        "purchase_ts").by(select("id").values("pid")).by(
                            select("id").out("has_identity").fold()).by(
                                select("purchase_ts")),
                project("persistent_id", "transient_id", "purchase_ts").by(
                    constant("")).by(select("id").fold()).by(
                        select("purchase_ts"))).project(
                            "persistent_id", "purchase_ts", "devices",
                            "visits").by(select("persistent_id")).by(
                                select("purchase_ts")).by(
                                    select("transient_id").unfold().group().by(
                                        values("uid")).by(values("type"))).
            by(
                select("transient_id").unfold().outE("visited").order().by(
                    "ts").where(inV().in_("links_to").where(
                        P.eq("website_group"))).project(
                            "transientId", "url",
                            "ts").by("uid").by("visited_url").by("ts").fold()))
Example #4
def query_transient_nodes_for_website(g, website_id, limit=10000):
    """Group transient nodes that visited a website by persistent id.

    Transient nodes with no persistent id fall into the shared
    "transient-nodes-connected-to-website" bucket; groups containing a
    single transient node are filtered out.
    """
    return (g.V(website_id).in_("visited").limit(limit).project(
        "uid", "pid"
    ).by("uid").by(in_("has_identity").values("pid").fold()).group().by(
        coalesce(
            select("pid").unfold(),
            constant("transient-nodes-connected-to-website"))).by(
                select("uid").dedup().limit(100).fold()).unfold().project(
                    "persistent-node-id", "transient-nodes").by(
                        select(Column.keys)).by(select(Column.values)).where(
                            select("transient-nodes").unfold().count().is_(
                                P.gt(1)))).toList()
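This helper already drains the traversal with toList(), so it returns plain dicts; a sketch of iterating them (the website id is a placeholder):

# Hypothetical usage of the grouping query.
for row in query_transient_nodes_for_website(g, "http://example.com"):
    print(row["persistent-node-id"], len(row["transient-nodes"]))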
Example #5
def wrapper(*args):
    t = fun(*args)
    r = t.map(
        union(label(), id(), coalesce(values('fq_name'),
                                      constant(''))).fold()).toList()
    # convert the gremlin result rows into a list of Resource objects
    resources = []
    for r_ in r:
        res_type = r_[0].replace('_', '-')
        uuid = text_type(r_[1])
        fq_name = r_[2]
        resources.append(Resource(res_type, uuid=uuid, fq_name=fq_name))
    return resources
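This snippet reads like the inner function of a decorator that converts raw Gremlin rows into Resource objects. The enclosing function is not shown; a plausible shape, with an assumed decorator name and a hypothetical decorated traversal, would be:

# Assumed enclosing decorator; only `wrapper` appears in the original.
def to_resources(fun):
    def wrapper(*args):
        ...  # body as in the snippet above
    return wrapper

# Hypothetical decorated traversal yielding vertices with an fq_name.
@to_resources
def list_virtual_networks(g):
    return g.V().hasLabel('virtual_network')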
Example #6
    def get_runtime_environment(self,
                                runtime_environment_name: str,
                                analysis_document_id: str = None) -> tuple:
        """Get runtime environment dependencies by its name.

        Select the newest analysis if no document id is present.
        """
        loop = asyncio.get_event_loop()

        if not analysis_document_id:
            analysis_document_id = loop.run_until_complete(self.g.V().has(
                '__label__',
                RuntimeEnvironment.__label__).has('__type__', 'vertex').has(
                    'runtime_environment_name',
                    runtime_environment_name).inE().has(
                        '__label__', IsPartOf.__label__).order().by(
                            'analysis_datetime',
                            Order.decr).range(0, 1).valueMap().select(
                                'analysis_document_id').next())

            if not analysis_document_id:
                raise NotFoundError(
                    f"No entries for runtime environment {runtime_environment_name!r} found"
                )

        query = self.g.V() \
            .has('__label__', RuntimeEnvironment.__label__) \
            .has('__type__', 'vertex') \
            .has('runtime_environment_name', runtime_environment_name) \
            .coalesce(inE()
                      .has('__label__', IsPartOf.__label__)
                      .has('analysis_document_id', analysis_document_id)
                      .outV(),
                      constant(False)) \
            .toList()

        result = loop.run_until_complete(query)
        if not result:
            # TODO: we assume an analysis always has at least some entries
            raise NotFoundError(
                f"No entries for runtime environment {runtime_environment_name!r} with "
                f"analysis document id {analysis_document_id!r} found")

        if result[0] is False:
            raise NotFoundError(
                f"No entries for runtime environment {runtime_environment_name!r} found"
            )

        return result, analysis_document_id
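A usage sketch, assuming `storage` is an already-connected instance of the class this method belongs to; the environment name is a placeholder:

# Hypothetical usage; raises NotFoundError when nothing matches.
result, document_id = storage.get_runtime_environment("fedora:28")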
Example #7
def recommend_similar_audience(g,
                               website_url,
                               categories_limit=3,
                               search_time_limit_in_seconds=15):
    """Given website url, categories_limit, categories_coin recommend similar audience in n most popular categories.

    Similar audience - audience of users that at least once visited subpage of domain that contains IAB-category codes
    that are most popular across users of given website
    """
    average_guy = (g.V(website_url).in_("visited").in_(
        "has_identity").dedup().hasLabel("persistentId").group().by().by(
            out("has_identity").out("visited").in_("links_to").groupCount().by(
                "categoryCode")).select(
                    Column.values).unfold().unfold().group().by(
                        Column.keys).by(select(
                            Column.values).mean()).unfold().order().by(
                                Column.values,
                                Order.desc).limit(categories_limit))

    most_popular_categories = dict(
        chain(*category.items()) for category in average_guy.toList())

    guy_stats_subquery = (out("has_identity").out("visited").in_(
        "links_to").groupCount().by("categoryCode").project(
            *most_popular_categories.keys()))

    conditions_subqueries = []
    for i in most_popular_categories:
        guy_stats_subquery = guy_stats_subquery.by(
            choose(select(i), select(i), constant(0)))
        conditions_subqueries.append(
            select(Column.values).unfold().select(i).is_(
                P.gt(int(most_popular_categories[i]))))

    return (g.V().hasLabel("websiteGroup").has(
        "categoryCode", P.within(list(
            most_popular_categories.keys()))).out("links_to").in_("visited").
            dedup().in_("has_identity").dedup().hasLabel("persistentId").where(
                out("has_identity").out("visited").has(
                    "url", P.neq(website_url))).timeLimit(
                        search_time_limit_in_seconds * 1000).local(
                            group().by().by(guy_stats_subquery).where(
                                or_(*conditions_subqueries))).select(
                                    Column.keys).unfold().out(
                                        "has_identity").values("uid"))
Example #8
def get_levels(topo_traversal: GraphTraversalSource) -> List[List[Vertex]]:
    """ Gets the levels of the logical graph. The traversal starts with the
    source spouts and performs a breadth-first search through the logically
    connected vertices.

    Arguments:
        topo_traversal (GraphTraversalSource):  A traversal source instance
                                                mapped to the topology subgraph
                                                whose levels are to be
                                                calculated.

    Returns:
        A list where each entry is a list of Vertex instances representing a
        level within the logical graph. The first level will be the spout
        instances.
    """

    # Only load the static enums we need so we don't pollute the globals dict
    keys = statics.staticEnums["keys"]
    values = statics.staticEnums["values"]
    local_scope = statics.staticEnums["local"]

    # Repeatedly traverse the tree defined by the logical connections,
    # grouping each set of vertices (de-duplicated) by their depth in the
    # tree. The depth is the number of times the repeat step has run
    # (loops()), so the cap step emits a map from integer depth to list of
    # vertices. After that we sort the map by key (ascending), take only
    # the values (the lists of vertices), and unfold them into a single list.
    # The initial group("m").by(constant(-1)) statement ensures the spout
    # vertices are included at the top of the list.
    levels: List[List[Vertex]] = (
        topo_traversal.V().hasLabel("spout").group("m").by(
            constant(-1)).repeat(
                out("logically_connected").dedup().group("m").by(
                    loops())).until(not_(outE("logically_connected"))).
        cap("m").order(local_scope).by(keys).select(values).unfold().toList())

    return levels
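A usage sketch, assuming `topo_g` is a traversal source already mapped to the topology subgraph, as the docstring requires:

# Hypothetical usage; topo_g must be scoped to the topology subgraph.
levels = get_levels(topo_g)
for depth, vertices in enumerate(levels):
    print("level %d: %d vertices" % (depth, len(vertices)))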