async def initialize(self):
    """Sample visit paths from the graph and turn them into query arguments.

    Opens a Neptune connection, picks a random subset of ``transientId``
    vertices (``coin(COIN)`` capped at ``ARG_COLLECTION``) and, for each of
    them, walks ``visited -> links_to (in) -> links_to (out)``.  Every
    non-empty path that comes back becomes one argument dict stored in
    ``self.args``.
    """
    connection = await init_neptune_connection()
    async with connection:
        g = Graph().traversal().withRemote(connection)

        # Path positions after the by() modulators:
        #   0: uid of the starting vertex
        #   1: ts of the "visited" edge
        #   2-4: url of the three vertices reached afterwards
        visit_path = (
            outE("visited").coin(COIN)
            .inV()
            .in_("links_to")
            .out("links_to").coin(COIN)
            .path()
            .by(values("uid"))
            .by(values("ts"))
            .by(values("url"))
            .by(values("url"))
            .by(values("url"))
        )

        sampled_ids = g.V().hasLabel("transientId").coin(COIN).limit(
            ARG_COLLECTION)
        paths_by_vertex = sampled_ids.group().by().by(visit_path)
        data = await paths_by_vertex.select(Column.values).unfold().toList()

    collected = []
    for path in data:
        # Skip empty results, there is nothing to index into.
        if not path:
            continue
        # Push the lower time bound 30-60 days before the sampled visit.
        since = path[1] - timedelta(days=random.randint(30, 60))
        collected.append({
            "transient_id": path[0],
            "website_url": path[2],
            "thank_you_page_url": path[4],
            "since": since,
            "min_visited_count": random.randint(2, 5),
        })
    self.args = collected
async def initialize(self):
    """Build query arguments from visits to the most active websites.

    Opens a Neptune connection, asks ``get_most_active_websites`` for the
    start vertices and, for each of them, samples paths along
    ``visited (in) -> links_to (in) -> links_to (out)``.  Every returned
    path becomes one argument dict stored in ``self.args``.
    """
    connection = await init_neptune_connection()
    async with connection:
        g = Graph().traversal().withRemote(connection)
        most_visited_websites = await get_most_active_websites(g)

        visit_path = (
            inE().hasLabel("visited").coin(COIN)
            .inV()
            .in_("links_to")
            .out("links_to").coin(COIN)
            .path()
            .by(values("url"))           # visited website
            .by(values("ts"))            # timestamp
            .by(values("url"))           # visited website
            .by(values("url"))           # root website
            .by(values("url").limit(1))  # thank you page
        )

        paths_by_website = g.V(most_visited_websites).group().by().by(
            visit_path)
        data = await paths_by_website.select(Column.values).unfold().toList()

    collected = []
    for path in data:
        # Push the lower time bound 30-60 days before the sampled visit.
        since = path[1] - timedelta(days=random.randint(30, 60))
        collected.append({
            "website_url": path[0],
            "thank_you_page_url": path[4],
            "since": since,
            "min_visited_count": random.randint(2, 5),
        })
    self.args = collected
def get_sibling_attrs(g, transient_id):
    """
    Summarise what is known about a transient id or its sibling nodes.

    The returned traversal projects, for the given transient id:
        * node attributes
        * IP / location information
        * IAB categories of visited websites

    When the transient id is attached to a persistent id (it has an incoming
    "has_identity" edge) the summary covers every transient id of that
    persistent id; otherwise it covers the transient id alone and the
    identity fields are empty strings.
    """
    summary_keys = (
        "identity_group_id",
        "persistent_id",
        "attributes",
        "ip_location",
        "iab_categories",
    )

    def merged_attributes(value_maps):
        # Collapse value maps into {property: [distinct values]}.
        return value_maps.unfold().group().by(Column.keys).by(
            select(Column.values).unfold().dedup().fold())

    def ip_locations(used_nodes):
        return used_nodes.dedup().valueMap().fold()

    def iab_categories(visited_nodes):
        return visited_nodes.in_("links_to").values(
            "categoryCode").dedup().fold()

    # Branch taken when a persistent id exists: start from it and fan out to
    # all of its transient ids.
    with_persistent_id = (
        in_("has_identity").project(*summary_keys)
        .by(in_("member").values("igid"))
        .by(values("pid"))
        .by(merged_attributes(out("has_identity").valueMap()))
        .by(ip_locations(out("has_identity").out("uses")))
        .by(iab_categories(out("has_identity").out("visited")))
    )

    # Fallback branch: describe the lone transient id itself.
    transient_only = (
        project(*summary_keys)
        .by(constant(""))
        .by(constant(""))
        .by(merged_attributes(valueMap()))
        .by(ip_locations(out("uses")))
        .by(iab_categories(out("visited")))
    )

    return g.V(transient_id).choose(
        in_("has_identity"),  # check if this transient id has persistent id
        with_persistent_id,
        transient_only,
    )
def get_activity_of_early_adopters(g, thank_you_page_url,
                                   skip_single_transients=False, limit=5):
    """
    Find the first early adopters of the product behind a thank you page.

    In other words:
        * find first few persistent identities (or transient if they're not
          matched with any user) that visited given thank you page
        * extract their *whole* activity on the domain of the thank_you_page

    With ``skip_single_transients=True`` visits whose source vertex has no
    incoming "has_identity" edge are dropped.  ``limit`` caps the number of
    distinct adopters.
    """
    # 1. "visited" edges pointing at the thank you page, oldest first.  The
    #    page and its "links_to" parent are labelled for later steps.
    purchases = (
        g.V(thank_you_page_url).hasLabel("website").as_("thank_you")
        .in_("links_to").as_("website_group")
        .select("thank_you")
        .inE("visited").order().by("ts")
    )

    # 2. Optional filter on visits made by identity-less transient ids.
    purchases = purchases.choose(
        constant(skip_single_transients).is_(P.eq(True)),
        where(outV().in_("has_identity")),
        identity(),
    )

    # 3. Tag each purchase with who made it.
    persistent_buyer = (
        project("type", "id", "purchase_ts")
        .by(constant("persistent"))
        .by(outV().in_("has_identity"))
        .by(values("ts"))
    )
    transient_buyer = (
        project("type", "id", "purchase_ts")
        .by(constant("transient"))
        .by(outV())
        .by(values("ts"))
    )
    first_buyers = purchases.choose(
        outV().in_("has_identity"),
        persistent_buyer,
        transient_buyer,
    ).dedup("id").limit(limit)

    # 4. Normalise both buyer kinds to the same shape; "transient_id" is
    #    always a folded list of transient id vertices.
    persistent_shape = (
        project("persistent_id", "transient_id", "purchase_ts")
        .by(select("id").values("pid"))
        .by(select("id").out("has_identity").fold())
        .by(select("purchase_ts"))
    )
    transient_shape = (
        project("persistent_id", "transient_id", "purchase_ts")
        .by(constant(""))
        .by(select("id").fold())
        .by(select("purchase_ts"))
    )
    adopters = first_buyers.choose(
        select("type").is_("persistent"),
        persistent_shape,
        transient_shape,
    )

    # 5. Attach devices and the ordered visits within the labelled
    #    website group.
    devices = select("transient_id").unfold().group().by(
        values("uid")).by(values("type"))
    visits = (
        select("transient_id").unfold()
        .outE("visited").order().by("ts")
        .where(inV().in_("links_to").where(P.eq("website_group")))
        .project("transientId", "url", "ts")
        .by("uid").by("visited_url").by("ts")
        .fold()
    )

    return (
        adopters.project("persistent_id", "purchase_ts", "devices", "visits")
        .by(select("persistent_id"))
        .by(select("purchase_ts"))
        .by(devices)
        .by(visits)
    )
def _get_subgraph(g, website_url, thank_you_page_url, since):
    """Project up to 20 persistent ids that visited either of two pages.

    For every selected persistent id the traversal returns its ``pid``, the
    ``uid`` of each of its transient ids, and the distinct "visited" events
    (``visited_url``, ``ts``, ``uid``) that target one of the two urls.

    NOTE: ``since`` is accepted but is not used by the traversal.
    """
    page_urls = [website_url, thank_you_page_url]

    # Persistent ids reachable from either page through visited/has_identity.
    visitors = (
        g.V().hasLabel("website")
        .has("url", P.within(page_urls))
        .in_("visited")
        .in_("has_identity")
        .dedup()
        .limit(20)
    )

    transient_uids = out("has_identity").values("uid").fold()
    visit_events = (
        out("has_identity").outE("visited")
        .has("visited_url", P.within(page_urls))
        .valueMap("visited_url", "ts", "uid")
        .dedup()
        .fold()
    )

    return (
        visitors.project("persistent_id", "transient_ids", "visited_events")
        .by(values("pid"))
        .by(transient_uids)
        .by(visit_events)
    )
def wrapper(*args):
    """Run the wrapped traversal builder and return its hits as resources.

    ``fun(*args)`` must return a traversal.  Each traverser is mapped to a
    folded ``[label, id, fq_name]`` row and converted to a ``Resource``.

    Returns:
        list of ``Resource`` objects, one per traverser.  Elements that have
        no ``fq_name`` property get an empty string as ``fq_name``.
    """
    t = fun(*args)
    # we should be able to fold() fq_name: https://issues.apache.org/jira/browse/TINKERPOP-1711
    rows = t.map(union(label(), id(), values('fq_name')).fold()).toList()

    # convert gremlin result in [Resource]
    resources = []
    for row in rows:
        res_type = row[0].replace('_', '-')
        # the id comes back as a typed value wrapper; unwrap it
        uuid = row[1]["@value"]
        # union() emits nothing for a missing property, so the folded row is
        # only [label, id] in that case; fall back to '' instead of raising
        # IndexError.
        fq_name = row[2] if len(row) > 2 else ''
        resources.append(Resource(res_type, uuid=uuid, fq_name=fq_name))
    return resources
def query_users_active_in_given_date_intervals(g, dt_conditions, limit=300):
    """Get users (persistent identities) that interacted with website in given date interval.

    A random half of the ``persistentId`` vertices is sampled and capped at
    ``limit`` *before* the date filter is applied.  ``dt_conditions`` are
    traversals or-ed together and evaluated on the "visited" edges of the
    user's transient ids.
    """
    sampled_users = g.V().hasLabel("persistentId").coin(0.5).limit(limit)

    visited_in_interval = out("has_identity").outE("visited").or_(
        *dt_conditions)

    # {property: [distinct values]} over all transient ids of the user.
    attributes = (
        out("has_identity").valueMap("browser", "email", "uid")
        .unfold()
        .group()
        .by(Column.keys)
        .by(select(Column.values).unfold().dedup().fold())
    )
    ip_location = out("has_identity").out("uses").dedup().valueMap().fold()

    return (
        sampled_users.where(visited_in_interval)
        .project("persistent_id", "attributes", "ip_location")
        .by(values("pid"))
        .by(attributes)
        .by(ip_location)
    )
def wrapper(*args):
    """Run the wrapped traversal builder and return its hits as resources.

    ``fun(*args)`` must return a traversal.  Each traverser is mapped to a
    folded ``[label, id, fq_name]`` row, with ``fq_name`` defaulting to an
    empty string when the property is absent, and converted to a
    ``Resource``.
    """
    lookup = union(label(), id(), coalesce(values('fq_name'), constant('')))
    rows = fun(*args).map(lookup.fold()).toList()

    # convert gremlin result in [Resource]
    return [
        Resource(row[0].replace('_', '-'),
                 uuid=text_type(row[1]),
                 fq_name=row[2])
        for row in rows
    ]
def query_users_intersted_in_content(g, iab_codes, limit=10000):
    """Get users (persistent identities) that interacted with websites with given iab codes.

    Roughly 80% of the ``persistentId`` vertices are sampled and capped at
    ``limit`` *before* the category filter is applied.
    """
    sampled_users = g.V().hasLabel("persistentId").coin(0.8).limit(limit)

    # True when any visited page links back to a node with a matching
    # category code.
    visited_matching_category = (
        out("has_identity").out("visited").in_("links_to")
        .has("categoryCode", P.within(iab_codes))
    )

    # {property: [distinct values]} over all transient ids of the user.
    attributes = (
        out("has_identity").valueMap("browser", "email", "uid")
        .unfold()
        .group()
        .by(Column.keys)
        .by(select(Column.values).unfold().dedup().fold())
    )
    ip_location = out("has_identity").out("uses").dedup().valueMap().fold()

    return (
        sampled_users.where(visited_matching_category)
        .project("persistent_id", "attributes", "ip_location")
        .by(values("pid"))
        .by(attributes)
        .by(ip_location)
    )