Ejemplo n.º 1
0
    async def initialize(self):
        """Populate ``self.args`` with query arguments sampled from the graph.

        Opens a Neptune connection, takes a random sample of ``transientId``
        vertices (``coin(COIN)``, capped at ``ARG_COLLECTION``) and collects,
        per sampled vertex, a path over a ``visited`` edge and two
        ``links_to`` hops. Every non-empty path becomes one argument dict
        with randomised ``since`` and ``min_visited_count`` values.
        """
        connection = await init_neptune_connection()
        async with connection:
            g = Graph().traversal().withRemote(connection)

            # Path elements, in order: uid of the transient vertex, ts of the
            # "visited" edge, then url of the visited vertex, of its
            # "links_to" parent and of a "links_to" child of that parent.
            visit_path = (
                outE("visited").coin(COIN)
                .inV()
                .in_("links_to")
                .out("links_to").coin(COIN)
                .path()
                .by(values("uid"))
                .by(values("ts"))
                .by(values("url"))
                .by(values("url"))
                .by(values("url"))
            )
            sampled_paths = (
                g.V().hasLabel("transientId").coin(COIN).limit(ARG_COLLECTION)
                .group().by().by(visit_path)
                .select(Column.values)
                .unfold()
            )
            data = await sampled_paths.toList()

            # Build the list locally so self.args is only replaced once the
            # whole result set has been converted.
            collected = []
            for result in data:
                if not result:
                    continue
                collected.append({
                    "transient_id": result[0],
                    "website_url": result[2],
                    "thank_you_page_url": result[4],
                    # look back a random 30-60 days before the visit timestamp
                    "since": result[1] - timedelta(days=random.randint(30, 60)),
                    "min_visited_count": random.randint(2, 5)
                })
            self.args = collected
Ejemplo n.º 2
0
    async def initialize(self):
        """Populate ``self.args`` with query arguments built from busy websites.

        Opens a Neptune connection, asks ``get_most_active_websites`` for the
        starting vertices and collects, per vertex, a path over an incoming
        ``visited`` edge and two ``links_to`` hops. Every returned path
        becomes one argument dict with randomised ``since`` and
        ``min_visited_count`` values.
        """
        connection = await init_neptune_connection()
        async with connection:
            g = Graph().traversal().withRemote(connection)

            most_visited_websites = await get_most_active_websites(g)

            visit_path = (
                inE().hasLabel("visited").coin(COIN)
                .inV()
                .in_("links_to")
                .out("links_to").coin(COIN)
                .path()
                .by(values("url"))  # visited website
                .by(values("ts"))  # timestamp
                .by(values("url"))  # visited website
                .by(values("url"))  # root website
                .by(values("url").limit(1))  # thank you page
            )
            grouped_paths = (
                g.V(most_visited_websites)
                .group().by().by(visit_path)
                .select(Column.values)
                .unfold()
            )
            data = await grouped_paths.toList()

            # Build the list locally so self.args is only replaced once the
            # whole result set has been converted.
            collected = []
            for result in data:
                collected.append({
                    "website_url": result[0],
                    "thank_you_page_url": result[4],
                    # look back a random 30-60 days before the visit timestamp
                    "since": result[1] - timedelta(days=random.randint(30, 60)),
                    "min_visited_count": random.randint(2, 5)
                })
            self.args = collected
Ejemplo n.º 3
0
def get_sibling_attrs(g, transient_id):
    """
    Build a traversal summarising a transient id or, when available, its siblings.

    If the transient id is attached to a persistent id (incoming
    ``has_identity``), the summary is computed over every transient id of that
    persistent id; otherwise only over the given vertex. Projected keys:
        * identity_group_id / persistent_id (empty strings when not attached)
        * attributes: property key -> list of distinct values
        * ip_location: value maps of the distinct ``uses`` neighbours
        * iab_categories: distinct ``categoryCode`` values reached via
          ``visited`` then incoming ``links_to``
    """
    summary_keys = ("identity_group_id", "persistent_id", "attributes",
                    "ip_location", "iab_categories")

    def distinct_values_by_key(value_maps):
        # Merge value maps into one map of key -> distinct values. A fresh
        # inner traversal is created on every call so nothing is shared.
        return value_maps.unfold().group().by(Column.keys).by(
            select(Column.values).unfold().dedup().fold())

    # Branch taken when the transient id has a persistent id.
    via_persistent_id = (
        in_("has_identity")
        .project(*summary_keys)
        .by(in_("member").values("igid"))
        .by(values("pid"))
        .by(distinct_values_by_key(out("has_identity").valueMap()))
        .by(out("has_identity").out("uses").dedup().valueMap().fold())
        .by(out("has_identity").out("visited").in_("links_to")
            .values("categoryCode").dedup().fold())
    )

    # Fallback branch: describe the transient id on its own.
    standalone = (
        project(*summary_keys)
        .by(constant(""))
        .by(constant(""))
        .by(distinct_values_by_key(valueMap()))
        .by(out("uses").dedup().valueMap().fold())
        .by(out("visited").in_("links_to")
            .values("categoryCode").dedup().fold())
    )

    return g.V(transient_id).choose(
        in_("has_identity"),  # check if this transient id has persistent id
        via_persistent_id,
        standalone)
Ejemplo n.º 4
0
def get_activity_of_early_adopters(g,
                                   thank_you_page_url,
                                   skip_single_transients=False,
                                   limit=5):
    """
    Build a traversal finding the first visitors of a thank-you page.

    Steps:
        * order the ``visited`` edges into the thank-you page by ``ts``
        * optionally (``skip_single_transients``) drop visits whose transient
          id has no persistent identity
        * identify each visitor by persistent identity when there is one,
          otherwise by the transient vertex, and keep the first ``limit``
          distinct visitors
        * for each, report the purchase timestamp, the devices (uid -> type)
          and all visits to pages under the thank-you page's ``links_to``
          parent
    """
    # "visited" edges into the thank-you page, oldest first. The page's
    # "links_to" parent is remembered as "website_group" for the final filter.
    purchases = (
        g.V(thank_you_page_url).hasLabel("website").as_("thank_you")
        .in_("links_to").as_("website_group")
        .select("thank_you")
        .inE("visited")
        .order().by("ts")
    )

    # The Python flag is embedded as a constant so the choice happens inside
    # the traversal itself.
    purchases = purchases.choose(
        constant(skip_single_transients).is_(P.eq(True)),
        where(outV().in_("has_identity")),
        identity())

    # Tag every purchase with the kind and vertex of whoever made it.
    by_persistent = (
        project("type", "id", "purchase_ts")
        .by(constant("persistent"))
        .by(outV().in_("has_identity"))
        .by(values("ts"))
    )
    by_transient = (
        project("type", "id", "purchase_ts")
        .by(constant("transient"))
        .by(outV())
        .by(values("ts"))
    )
    first_buyers = (
        purchases
        .choose(outV().in_("has_identity"), by_persistent, by_transient)
        .dedup("id")
        .limit(limit)
    )

    # Normalise both kinds to (persistent_id, [transient vertices], purchase_ts).
    persistent_shape = (
        project("persistent_id", "transient_id", "purchase_ts")
        .by(select("id").values("pid"))
        .by(select("id").out("has_identity").fold())
        .by(select("purchase_ts"))
    )
    transient_shape = (
        project("persistent_id", "transient_id", "purchase_ts")
        .by(constant(""))
        .by(select("id").fold())
        .by(select("purchase_ts"))
    )
    normalised = first_buyers.choose(
        select("type").is_("persistent"), persistent_shape, transient_shape)

    devices = (
        select("transient_id").unfold()
        .group().by(values("uid")).by(values("type"))
    )
    # Visits, oldest first, restricted to pages whose "links_to" parent is
    # the vertex labelled "website_group" above.
    visits = (
        select("transient_id").unfold()
        .outE("visited")
        .order().by("ts")
        .where(inV().in_("links_to").where(P.eq("website_group")))
        .project("transientId", "url", "ts")
        .by("uid").by("visited_url").by("ts")
        .fold()
    )

    return (
        normalised
        .project("persistent_id", "purchase_ts", "devices", "visits")
        .by(select("persistent_id"))
        .by(select("purchase_ts"))
        .by(devices)
        .by(visits)
    )
def _get_subgraph(g, website_url, thank_you_page_url, since):
    """
    Build a traversal describing up to 20 persistent ids that visited either url.

    Starting from ``website`` vertices whose ``url`` is ``website_url`` or
    ``thank_you_page_url``, walks back over ``visited`` and ``has_identity``
    and projects, per distinct vertex reached:
        * persistent_id: its ``pid``
        * transient_ids: ``uid`` of every ``has_identity`` neighbour
        * visited_events: distinct (visited_url, ts, uid) value maps of the
          ``visited`` edges pointing at either url

    NOTE: ``since`` is accepted but is not referenced by the traversal.
    """
    visited_events = (
        out("has_identity")
        .outE("visited")
        .has("visited_url", P.within([website_url, thank_you_page_url]))
        .valueMap("visited_url", "ts", "uid")
        .dedup()
        .fold()
    )
    transient_ids = out("has_identity").values("uid").fold()

    visitors = (
        g.V().hasLabel("website")
        .has("url", P.within([website_url, thank_you_page_url]))
        .in_("visited")
        .in_("has_identity")
        .dedup()
        .limit(20)
    )
    return (
        visitors
        .project("persistent_id", "transient_ids", "visited_events")
        .by(values("pid"))
        .by(transient_ids)
        .by(visited_events)
    )
Ejemplo n.º 6
0
 def wrapper(*args):
     """Call ``fun`` and convert the resulting traversal into ``Resource`` objects."""
     # we should be able to fold() fq_name: https://issues.apache.org/jira/browse/TINKERPOP-1711
     rows = fun(*args).map(
         union(label(), id(), values('fq_name')).fold()).toList()
     # Each row is [label, id, fq_name]. The id is indexed with "@value",
     # presumably a typed GraphSON value - confirm against the serializer.
     return [
         Resource(row[0].replace('_', '-'),
                  uuid=row[1]["@value"],
                  fq_name=row[2])
         for row in rows
     ]
Ejemplo n.º 7
0
def query_users_active_in_given_date_intervals(g, dt_conditions, limit=300):
    """
    Build a traversal for persistent ids with a visit matching any date condition.

    ``persistentId`` vertices are first randomly sampled (``coin(0.5)``) and
    capped at ``limit``; the date filter is applied afterwards, so fewer than
    ``limit`` rows may come back. ``dt_conditions`` are traversals OR-ed
    together on the ``visited`` edges. Projected keys: ``persistent_id``,
    ``attributes`` (browser / email / uid -> distinct values) and
    ``ip_location`` (value maps of distinct ``uses`` neighbours).
    """
    visited_in_interval = (
        out("has_identity").outE("visited").or_(*dt_conditions)
    )
    attributes = (
        out("has_identity")
        .valueMap("browser", "email", "uid")
        .unfold()
        .group()
        .by(Column.keys)
        .by(select(Column.values).unfold().dedup().fold())
    )
    ip_location = out("has_identity").out("uses").dedup().valueMap().fold()

    sampled_users = g.V().hasLabel("persistentId").coin(0.5).limit(limit)
    return (
        sampled_users
        .where(visited_in_interval)
        .project("persistent_id", "attributes", "ip_location")
        .by(values("pid"))
        .by(attributes)
        .by(ip_location)
    )
Ejemplo n.º 8
0
 def wrapper(*args):
     """Call ``fun`` and convert the resulting traversal into ``Resource`` objects."""
     # fq_name falls back to '' so every row has exactly three entries
     row_shape = union(label(), id(), coalesce(values('fq_name'),
                                               constant(''))).fold()
     rows = fun(*args).map(row_shape).toList()
     # convert gremlin result in [Resource]; each row is [label, id, fq_name]
     return [
         Resource(row[0].replace('_', '-'),
                  uuid=text_type(row[1]),
                  fq_name=row[2])
         for row in rows
     ]
Ejemplo n.º 9
0
def query_users_intersted_in_content(g, iab_codes, limit=10000):
    """
    Build a traversal for persistent ids that visited content with given IAB codes.

    ``persistentId`` vertices are first randomly sampled (``coin(0.8)``) and
    capped at ``limit``; the category filter is applied afterwards, so fewer
    than ``limit`` rows may come back. A vertex is kept when some visited
    page has a ``links_to`` parent whose ``categoryCode`` is in ``iab_codes``.
    Projected keys: ``persistent_id``, ``attributes`` (browser / email / uid
    -> distinct values) and ``ip_location`` (value maps of distinct ``uses``
    neighbours).
    """
    visited_matching_category = (
        out("has_identity")
        .out("visited")
        .in_("links_to")
        .has("categoryCode", P.within(iab_codes))
    )
    attributes = (
        out("has_identity")
        .valueMap("browser", "email", "uid")
        .unfold()
        .group()
        .by(Column.keys)
        .by(select(Column.values).unfold().dedup().fold())
    )
    ip_location = out("has_identity").out("uses").dedup().valueMap().fold()

    sampled_users = g.V().hasLabel("persistentId").coin(0.8).limit(limit)
    return (
        sampled_users
        .where(visited_matching_category)
        .project("persistent_id", "attributes", "ip_location")
        .by(values("pid"))
        .by(attributes)
        .by(ip_location)
    )