def get_sibling_attrs(g, transient_id):
    """
    Build a summary traversal for a transient id and its sibling nodes.

    If the vertex has an incoming "has_identity" edge, the summary is computed
    from that persistent identity and spans every transient id attached to it.
    Otherwise it is computed from the transient vertex alone and the two id
    fields are empty strings.

    The projected map has the keys:
        * identity_group_id - "igid" of the vertex reached over incoming "member"
        * persistent_id - "pid" of the persistent identity
        * attributes - node properties, each mapped to its de-duplicated values
        * ip_location - value maps of distinct vertices reached over "uses"
          (IP / location information)
        * iab_categories - distinct "categoryCode" values of the nodes linking
          to visited websites
    """
    summary_keys = (
        "identity_group_id",
        "persistent_id",
        "attributes",
        "ip_location",
        "iab_categories",
    )

    def unique_values():
        # A fresh traversal per call: gremlin-python traversals are mutable,
        # so the same instance is never shared between two by() modulators.
        return select(Column.values).unfold().dedup().fold()

    # Branch used when the transient id is attached to a persistent identity.
    sibling_attributes = (
        out("has_identity").valueMap().unfold().group()
        .by(Column.keys)
        .by(unique_values())
    )
    sibling_locations = out("has_identity").out("uses").dedup().valueMap().fold()
    sibling_categories = (
        out("has_identity").out("visited").in_("links_to")
        .values("categoryCode").dedup().fold()
    )
    via_persistent_identity = (
        in_("has_identity").project(*summary_keys)
        .by(in_("member").values("igid"))
        .by(values("pid"))
        .by(sibling_attributes)
        .by(sibling_locations)
        .by(sibling_categories)
    )

    # Branch used when the transient id stands alone.
    own_attributes = (
        valueMap().unfold().group()
        .by(Column.keys)
        .by(unique_values())
    )
    own_locations = out("uses").dedup().valueMap().fold()
    own_categories = (
        out("visited").in_("links_to").values("categoryCode").dedup().fold()
    )
    transient_only = (
        project(*summary_keys)
        .by(constant(""))
        .by(constant(""))
        .by(own_attributes)
        .by(own_locations)
        .by(own_categories)
    )

    return g.V(transient_id).choose(
        in_("has_identity"),  # does this transient id have a persistent id?
        via_persistent_identity,
        transient_only,
    )
def undecided_user_audience_check(g, transient_id, website_url,
                                  thank_you_page_url, since,
                                  min_visited_count):
    """
    Build a yes/no traversal telling whether a transient id is in the audience.

    The "visited" edges examined are those of every transient id sharing a
    persistent identity with ``transient_id`` whose "ts" is greater than
    ``since``. The traversal yields True when both hold:
        * ``website_url`` was visited more than ``min_visited_count`` times
        * ``thank_you_page_url`` was not visited at all

    and False otherwise.
    """
    # Both counters accumulate into the same "visits" side-effect map.
    # The key "thank_you_page_vists" is misspelled but must stay identical
    # where it is written and where it is read back.
    tally_page_visit = groupCount("visits").by(constant("page_visits"))
    tally_thank_you_visit = groupCount("visits").by(
        constant("thank_you_page_vists"))

    never_reached_thank_you = coalesce(
        select("thank_you_page_vists"), constant(0)).is_(0)
    visited_often_enough = select("page_visits").is_(
        P.gt(min_visited_count))
    positive_answer = and_(
        never_reached_thank_you,
        visited_often_enough,
    ).choose(count().is_(1), constant(True))

    recent_visits = (
        g.V(transient_id).hasLabel("transientId")
        .in_("has_identity").out("has_identity")
        .outE("visited").has("ts", P.gt(since))
    )
    visit_counts = (
        recent_visits
        .choose(has("visited_url", website_url), tally_page_visit)
        .choose(has("visited_url", thank_you_page_url), tally_thank_you_visit)
        .cap("visits")
    )
    # Fall back to False whenever the positive branch yields nothing.
    return visit_counts.coalesce(positive_answer, constant(False))
def query_transient_nodes_for_website(g, website_id, limit=10000):
    """
    List groups of transient nodes that visited a website.

    Up to ``limit`` visitors of ``website_id`` are grouped by the "pid" of
    their persistent identity. Visitors without one share the key
    "transient-nodes-connected-to-website". Each group keeps at most 100
    distinct "uid" values, and only groups with more than one uid are returned.

    Returns a list of maps with the keys "persistent-node-id" and
    "transient-nodes".
    """
    grouping_key = coalesce(
        select("pid").unfold(),
        constant("transient-nodes-connected-to-website"),
    )
    grouped_uids = select("uid").dedup().limit(100).fold()
    more_than_one_uid = select("transient-nodes").unfold().count().is_(P.gt(1))

    visitors = (
        g.V(website_id).in_("visited").limit(limit)
        .project("uid", "pid")
        .by("uid")
        .by(in_("has_identity").values("pid").fold())
    )
    groups = visitors.group().by(grouping_key).by(grouped_uids).unfold()
    rows = (
        groups.project("persistent-node-id", "transient-nodes")
        .by(select(Column.keys))
        .by(select(Column.values))
        .where(more_than_one_uid)
    )
    return rows.toList()
def check_duplicate_ip_addresses(g):
    """networks with duplicate ip addresses
    """
    # The docstring above is printed (stripped) in the report header below,
    # so its text is part of the output and is left untouched.

    # instance_ip vertices attached to the network, grouped by address; the
    # Groovy filter keeps only addresses that occur more than once.
    duplicated_ips = (
        __.in_().hasLabel("instance_ip").has("instance_ip_address")
        .group().by("instance_ip_address").unfold()
        .filter(lambda: "it.get().value.size() > 1")
    )
    # One list per network: the network itself followed by its duplicate
    # entries. Lists holding only the network are dropped.
    network_with_duplicates = (
        union(select('vn'), duplicated_ips)
        .fold()
        .filter(lambda: "it.get().size() > 1")
    )
    found = (
        g.V().hasLabel("virtual_network").as_('vn')
        .flatMap(network_with_duplicates)
        .toList()
    )
    if not found:
        return found

    printo('Found %d %s:' % (len(found),
                             check_duplicate_ip_addresses.__doc__.strip()))
    for entry in found:
        # FIXME:
        # NOTE(review): the label is forced before conversion; the reason is
        # not visible here - confirm whether this is still needed.
        entry[0].label = 'virtual_network'
        # First item is the vn
        network = v_to_r(entry[0])
        printo(' - %s/%s - %s' % (network.type, network.uuid,
                                  network.fq_name))
        for by_address in entry[1:]:
            for address, instance_ips in by_address.items():
                printo(" %s:" % address)
                for instance_ip in instance_ips:
                    resource = v_to_r(instance_ip)
                    printo(' - %s/%s - %s' % (resource.type, resource.uuid,
                                              resource.fq_name))
    return found
def recommend_similar_audience(g, website_url, categories_limit=3,
                               search_time_limit_in_seconds=15):
    """
    Recommend an audience similar to the visitors of ``website_url``.

    Two steps:
        1. Executed immediately: find the ``categories_limit`` IAB category
           codes with the highest mean visit count across the persistent
           identities that visited ``website_url``.
        2. Returned as a traversal: find persistent identities that visited
           pages in those categories, visited at least one url other than
           ``website_url``, and exceed the mean count (truncated to int) in
           at least one of those categories.

    The search in step 2 is bounded by ``search_time_limit_in_seconds``.
    The returned traversal yields the "uid" of matching transient ids.
    """
    # Step 1: per-identity category counts, averaged per category.
    per_identity_counts = (
        out("has_identity").out("visited").in_("links_to")
        .groupCount().by("categoryCode")
    )
    average_guy = (
        g.V(website_url).in_("visited").in_("has_identity").dedup()
        .hasLabel("persistentId")
        .group().by().by(per_identity_counts)
        .select(Column.values).unfold().unfold()
        .group().by(Column.keys).by(select(Column.values).mean())
        .unfold()
        .order().by(Column.values, Order.desc)
        .limit(categories_limit)
    )
    # Every result is flattened into one (code, mean) pair for the dict.
    most_popular_categories = dict(
        chain(*category.items()) for category in average_guy.toList())

    # Step 2: a candidate's counts in the popular categories, 0 when absent.
    guy_stats_subquery = (
        out("has_identity").out("visited").in_("links_to")
        .groupCount().by("categoryCode")
        .project(*most_popular_categories)
    )
    for code in most_popular_categories:
        guy_stats_subquery = guy_stats_subquery.by(
            choose(select(code), select(code), constant(0)))

    conditions_subqueries = [
        select(Column.values).unfold().select(code).is_(P.gt(int(mean_count)))
        for code, mean_count in most_popular_categories.items()
    ]

    candidates = (
        g.V().hasLabel("websiteGroup")
        .has("categoryCode", P.within(list(most_popular_categories)))
        .out("links_to").in_("visited").dedup()
        .in_("has_identity").dedup()
        .hasLabel("persistentId")
        .where(out("has_identity").out("visited").has(
            "url", P.neq(website_url)))
        .timeLimit(search_time_limit_in_seconds * 1000)
    )
    matching = candidates.local(
        group().by().by(guy_stats_subquery)
        .where(or_(*conditions_subqueries))
    )
    return (
        matching.select(Column.keys).unfold()
        .out("has_identity").values("uid")
    )
def _query_users_activities_stats(g, website_url, most_popular_categories,
                                  search_time_limit_in_seconds=30):
    """
    Build a traversal with per-category activity counts of candidate users.

    Candidates are persistent identities that visited a page linked from a
    "websiteGroup" whose "categoryCode" is a key of
    ``most_popular_categories``, and that visited at least one url other than
    ``website_url``. The search is bounded by
    ``search_time_limit_in_seconds``.

    Each result is a map with the keys:
        * pid - the persistent identity vertex
        * iabs - visit counts keyed by "categoryCode"
        * tids - "uid" values of the identity's transient ids
    """
    category_codes = list(most_popular_categories)
    visited_elsewhere = out("has_identity").out("visited").has(
        "url", P.neq(website_url))
    category_counts = (
        out("has_identity").out("visited").in_("links_to")
        .groupCount().by("categoryCode")
    )
    transient_uids = (
        select(Column.keys).unfold()
        .out("has_identity").values("uid").fold()
    )
    stats_per_identity = (
        group().by().by(category_counts)
        .project("pid", "iabs", "tids")
        .by(select(Column.keys).unfold())
        .by(select(Column.values).unfold())
        .by(transient_uids)
    )

    candidates = (
        g.V().hasLabel("websiteGroup")
        .has("categoryCode", P.within(category_codes))
        .out("links_to").in_("visited").dedup()
        .in_("has_identity").dedup()
        .hasLabel("persistentId")
        .where(visited_elsewhere)
        .timeLimit(search_time_limit_in_seconds * 1000)
    )
    return candidates.local(stats_per_identity)
def get_activity_of_early_adopters(g, thank_you_page_url,
                                   skip_single_transients=False, limit=5):
    """
    Build a traversal describing the first visitors of a thank you page.

    Steps:
        * take the "visited" edges into the thank you page, ordered by "ts"
        * if ``skip_single_transients`` is true, drop visits by transient ids
          that have no persistent identity
        * map each visit to its persistent identity, or to the transient id
          itself when there is none, and keep the first ``limit`` of them
        * collect the visits of all their transient ids to pages linked from
          the same website group as the thank you page, ordered by "ts"

    Each result is a map with the keys "persistent_id" (empty string for an
    unmatched transient id), "purchase_ts", "devices" (uid -> type) and
    "visits" (list of maps with "transientId", "url" and "ts").
    """
    visit_fields = ("type", "id", "purchase_ts")
    identity_fields = ("persistent_id", "transient_id", "purchase_ts")

    # Stage 1: visits to the thank you page in chronological order.
    purchases = (
        g.V(thank_you_page_url).hasLabel("website").as_("thank_you")
        .in_("links_to").as_("website_group")
        .select("thank_you").inE("visited")
        .order().by("ts")
    )

    # Stage 2: optionally keep only visitors with a persistent identity.
    purchases = purchases.choose(
        constant(skip_single_transients).is_(P.eq(True)),
        where(outV().in_("has_identity")),
        identity(),
    )

    # Stage 3: tag every visit with who made it.
    as_persistent = (
        project(*visit_fields)
        .by(constant("persistent"))
        .by(outV().in_("has_identity"))
        .by(values("ts"))
    )
    as_transient = (
        project(*visit_fields)
        .by(constant("transient"))
        .by(outV())
        .by(values("ts"))
    )
    first_buyers = (
        purchases
        .choose(outV().in_("has_identity"), as_persistent, as_transient)
        .dedup("id")
        .limit(limit)
    )

    # Stage 4: expand every buyer into the list of its transient ids.
    persistent_buyer = (
        project(*identity_fields)
        .by(select("id").values("pid"))
        .by(select("id").out("has_identity").fold())
        .by(select("purchase_ts"))
    )
    transient_buyer = (
        project(*identity_fields)
        .by(constant(""))
        .by(select("id").fold())
        .by(select("purchase_ts"))
    )
    buyers = first_buyers.choose(
        select("type").is_("persistent"),
        persistent_buyer,
        transient_buyer,
    )

    # Stage 5: devices and on-domain activity for every buyer.
    devices = (
        select("transient_id").unfold()
        .group().by(values("uid")).by(values("type"))
    )
    same_website_group = inV().in_("links_to").where(P.eq("website_group"))
    visits = (
        select("transient_id").unfold()
        .outE("visited").order().by("ts")
        .where(same_website_group)
        .project("transientId", "url", "ts")
        .by("uid").by("visited_url").by("ts")
        .fold()
    )
    return (
        buyers.project("persistent_id", "purchase_ts", "devices", "visits")
        .by(select("persistent_id"))
        .by(select("purchase_ts"))
        .by(devices)
        .by(visits)
    )
def _get_categories_popular_across_audience_of_website(g, website_url,
                                                       categories_limit=3):
    """
    Build a traversal ranking IAB categories among a website's visitors.

    For every persistent identity that visited ``website_url``, visits are
    counted per "categoryCode". The counts are then averaged per category, and
    the ``categories_limit`` entries with the highest mean are yielded in
    descending order.
    """
    counts_per_identity = (
        out("has_identity").out("visited").in_("links_to")
        .groupCount().by("categoryCode")
    )
    mean_count = select(Column.values).mean()

    audience = (
        g.V(website_url).in_("visited").in_("has_identity").dedup()
        .hasLabel("persistentId")
    )
    # The double unfold turns the map of per-identity maps into single
    # (category, count) entries.
    category_entries = (
        audience.group().by().by(counts_per_identity)
        .select(Column.values).unfold().unfold()
    )
    return (
        category_entries.group().by(Column.keys).by(mean_count)
        .unfold()
        .order().by(Column.values, Order.desc)
        .limit(categories_limit)
    )
def query_users_active_in_given_date_intervals(g, dt_conditions, limit=300):
    """
    Build a traversal of persistent identities active in given date intervals.

    ``dt_conditions`` are traversals evaluated against "visited" edges; an
    identity matches when any of them holds for any of its visits.

    Persistent identities are sampled with ``coin(0.5)`` and capped at
    ``limit`` before the date filter runs, so fewer than ``limit`` results may
    come back.

    Each result is a map with the keys "persistent_id", "attributes"
    (browser / email / uid, each mapped to its distinct values) and
    "ip_location".
    """
    visited_in_interval = (
        out("has_identity").outE("visited").or_(*dt_conditions)
    )
    attributes = (
        out("has_identity").valueMap("browser", "email", "uid")
        .unfold().group()
        .by(Column.keys)
        .by(select(Column.values).unfold().dedup().fold())
    )
    ip_location = out("has_identity").out("uses").dedup().valueMap().fold()

    sampled = g.V().hasLabel("persistentId").coin(0.5).limit(limit)
    return (
        sampled.where(visited_in_interval)
        .project("persistent_id", "attributes", "ip_location")
        .by(values("pid"))
        .by(attributes)
        .by(ip_location)
    )
def query_users_intersted_in_content(g, iab_codes, limit=10000):
    """
    Build a traversal of persistent identities interested in given content.

    An identity matches when any of its transient ids visited a page linked
    from a node whose "categoryCode" is in ``iab_codes``.

    Persistent identities are sampled with ``coin(0.8)`` and capped at
    ``limit`` before the content filter runs, so fewer than ``limit`` results
    may come back.

    Each result is a map with the keys "persistent_id", "attributes"
    (browser / email / uid, each mapped to its distinct values) and
    "ip_location".
    """
    visited_matching_content = (
        out("has_identity").out("visited").in_("links_to")
        .has("categoryCode", P.within(iab_codes))
    )
    attributes = (
        out("has_identity").valueMap("browser", "email", "uid")
        .unfold().group()
        .by(Column.keys)
        .by(select(Column.values).unfold().dedup().fold())
    )
    ip_location = out("has_identity").out("uses").dedup().valueMap().fold()

    sampled = g.V().hasLabel("persistentId").coin(0.8).limit(limit)
    return (
        sampled.where(visited_matching_content)
        .project("persistent_id", "attributes", "ip_location")
        .by(values("pid"))
        .by(attributes)
        .by(ip_location)
    )
def undecided_users_audience(g, website_url, thank_you_page_url, since,
                             min_visited_count):
    """
    Build a traversal of the users that meet the audience conditions.

    The traversal yields the "uid" of transient identities whose persistent
    identity:
        * visited ``website_url`` more than ``min_visited_count`` times with
          "ts" greater than ``since``
        * did not visit ``thank_you_page_url`` with "ts" greater than ``since``
    """
    # Persistent identities with enough recent visits to the website.
    frequent_visitors = (
        g.V(website_url).hasLabel("website")
        .inE("visited").has("ts", P.gt(since))
        .outV().in_("has_identity")
        .groupCount().unfold().dedup()
        .where(select(Column.values).is_(P.gt(min_visited_count)))
        .select(Column.keys).as_("pids")
    )

    # For each of them, the pids that also reached the thank you page.
    reached_thank_you_page = (
        out("has_identity").outE("visited")
        .has("visited_url", thank_you_page_url)
        .has("ts", P.gt(since))
        .outV().in_("has_identity").dedup()
        .values("pid").fold()
    )
    already_converted = has("pid", where(P.within("pids_that_visited")))

    undecided = (
        frequent_visitors
        .map(reached_thank_you_page).as_("pids_that_visited")
        .select("pids")
        .not_(already_converted)
    )
    return undecided.out("has_identity").values("uid")
def _get_persistent_ids_which_visited_website(g, root_url):
    """
    Build a traversal pairing a website with identities that visited it.

    The single result is a map with the keys:
        * root_url - value map (including id and label) of the ``root_url``
          vertex
        * persistent_ids - list of up to 50 distinct vertices reached over
          incoming "has_identity" from the website's visitors
    """
    # The starting vertex is stashed in a side effect so it can be read back
    # after the traversal has moved on to the identities.
    visitors_identities = (
        g.V(root_url).aggregate("root_url")
        .in_("visited").in_("has_identity")
        .dedup().limit(50).fold()
    )
    website_details = select("root_url").unfold().valueMap(True)
    return (
        visitors_identities.project("root_url", "persistent_ids")
        .by(website_details)
        .by()
    )