def _restrict_sr(sr):
    """Return a cloudsearch-appropriate query clause that restricts
    results to sr.

    """
    if isinstance(sr, MultiVerbify):
        if not sr.sr_ids:
            raise InvalidQuery
        srs = ["sr_id:%s" % sr_id for sr_id in sr.sr_ids]
        return "(or %s)" % ' '.join(srs)
    elif isinstance(sr, DomainSR):
        return "site:'\"%s\"'" % sr.domain
    elif isinstance(sr, FriendsSR):
        if not c.user_is_loggedin or not c.user.friends:
            raise InvalidQuery
        # The query limit is roughly 8k bytes. Limit to 200 friends to
        # avoid getting too close to that limit
        friend_ids = c.user.friends[:200]
        friends = ["author_fullname:'%s'" %
                   Account._fullname_from_id36(v1utils.to36(id_))
                   for id_ in friend_ids]
        return "(or %s)" % ' '.join(friends)
    elif isinstance(sr, AllMinus):
        if not sr.exclude_sr_ids:
            raise InvalidQuery
        exclude_srs = ["sr_id:%s" % sr_id for sr_id in sr.exclude_sr_ids]
        return "(not (or %s))" % ' '.join(exclude_srs)
    elif not isinstance(sr, FakeSubverbify):
        return "sr_id:%s" % sr._id

    return None
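# A minimal sketch (not part of this module) of how the clause returned
# above would be combined with a base query, following the same prefix
# boolean syntax as the "(or ...)" / "(not ...)" clauses built here;
# `basequery` is a hypothetical example.
def _example_compose_query(basequery, sr):
    # e.g. _restrict_sr(multi) -> "(or sr_id:1 sr_id:2)"
    restriction = _restrict_sr(sr)
    if restriction:
        return "(and %s %s)" % (basequery, restriction)
    return basequery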
def cached_query_wrapper(*args):
    # build the row key from the function name and arguments
    assert fn.__name__.startswith("get_")
    row_key_components = [fn.__name__[len('get_'):]]

    if len(args) > 0:
        # we want to accept either a Thing or a thing's ID at this
        # layer, but the query itself should always get just an ID
        if isinstance(args[0], Thing):
            args = list(args)
            args[0] = args[0]._id

        if isinstance(args[0], (int, long)):
            serialized = to36(args[0])
        else:
            serialized = str(args[0])
        row_key_components.append(serialized)

    row_key_components.extend(str(x) for x in args[1:])
    row_key = '.'.join(row_key_components)

    query = fn(*args)
    query_sort = query._sort
    try:
        is_precomputed = query.precomputed
    except AttributeError:
        is_precomputed = _is_query_precomputed(query)

    return CachedQuery(model, row_key, query_sort, filter_fn, is_precomputed)
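# A sketch of the decorator factory that presumably encloses
# cached_query_wrapper: `fn`, `model`, and `filter_fn` above are free
# variables closed over from these scopes. The names and signature here
# are inferred from the wrapper body, not confirmed by this file.
def cached_query(model, filter_fn=lambda things: things):
    def cached_query_decorator(fn):
        def cached_query_wrapper(*args):
            pass  # body as above, closing over fn, model, and filter_fn
        return cached_query_wrapper
    return cached_query_decorator

# usage sketch, with a hypothetical query function:
#
#   @cached_query(UserQueryCache)
#   def get_submitted(user_id, sort, time):
#       ...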
@classmethod
def by_sr(cls, sr_id, create=False):
    try:
        return cls._byID(to36(sr_id))
    except tdb_cassandra.NotFound:
        if create:
            return cls._new(sr_id)
        raise
def add_target_fields(self, target):
    if not target:
        return
    from v1.models import Comment, Link, Message

    self.add("target_id", target._id)
    self.add("target_fullname", target._fullname)
    self.add("target_age_seconds", target._age.total_seconds())

    target_type = target.__class__.__name__.lower()
    if target_type == "link" and target.is_self:
        target_type = "self"
    self.add("target_type", target_type)

    # If the target is an Account or Subverbify (or has a "name" attr),
    # add the target_name
    if hasattr(target, "name"):
        self.add("target_name", target.name)

    # Add info about the target's author for comments, links, & messages
    if isinstance(target, (Comment, Link, Message)):
        author = target.author_slow
        if target._deleted or author._deleted:
            self.add("target_author_id", 0)
            self.add("target_author_name", "[deleted]")
        else:
            self.add("target_author_id", author._id)
            self.add("target_author_name", author.name)

    # Add info about the url being linked to for link posts
    if isinstance(target, Link):
        self.add_text("target_title", target.title)
        if not target.is_self:
            self.add("target_url", target.url)
            self.add("target_url_domain", target.link_domain())

    # Add info about the link being commented on for comments
    if isinstance(target, Comment):
        link_fullname = Link._fullname_from_id36(to36(target.link_id))
        self.add("link_id", target.link_id)
        self.add("link_fullname", link_fullname)

    # Add info about when target was originally posted for links/comments
    if isinstance(target, (Comment, Link)):
        self.add("target_created_ts", to_epoch_milliseconds(target._date))

    hooks.get_hook("eventcollector.add_target_fields").call(
        event=self,
        target=target,
    )
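# A hypothetical sketch of the fields add_target_fields emits for an
# intact (non-deleted) comment target: the keys come from the calls
# above, while the values and fullname prefixes are invented for
# illustration.
_example_comment_target_fields = {
    "target_id": 12345,
    "target_fullname": "t1_9ix",
    "target_age_seconds": 86400.0,
    "target_type": "comment",
    "target_author_id": 678,
    "target_author_name": "example_user",
    "link_id": 999,
    "link_fullname": "t3_rr",
    "target_created_ts": 1434316936000,
}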
def migrate_srmember_subscribers(after_user_id=39566712):
    columns = {}
    rowkey = None
    proc_time = time.time()

    for i, rel in enumerate(get_srmembers(after_user_id)):
        sr_id = rel._thing1_id
        user_id = rel._thing2_id
        action_date = rel._date
        new_rowkey = to36(user_id)

        if new_rowkey != rowkey and columns:
            # fixed historical write timestamp, presumably so that live
            # subscribe/unsubscribe writes always supersede this backfill
            SubscribedSubverbifysByAccount._cf.insert(
                rowkey, columns, timestamp=1434403336829573)
            columns = {}

        columns[to36(sr_id)] = action_date
        rowkey = new_rowkey

        if i % 1000 == 0:
            new_proc_time = time.time()
            duration = new_proc_time - proc_time
            print "%s (%.3f): %s - %s" % (i, duration, user_id, action_date)
            proc_time = new_proc_time

    # flush the final user's batch, which the loop above never reaches
    if columns:
        SubscribedSubverbifysByAccount._cf.insert(
            rowkey, columns, timestamp=1434403336829573)
def get_recommended_content(prefs, src, settings):
    """Get a mix of content from subverbifys recommended for someone with
    the given preferences (likes and dislikes).

    Returns a list of ExploreItems.

    """
    # numbers chosen empirically to give enough results for explore page
    num_liked = 10  # how many liked srs to use when generating the recs
    num_recs = 20  # how many recommended srs to ask for
    num_discovery = 2  # how many discovery-related subverbifys to mix in
    num_rising = 4  # how many rising links to mix in
    num_items = 20  # total items to return
    rising_items = discovery_items = comment_items = hot_items = []

    # make a list of srs that shouldn't be recommended
    default_srid36s = [to36(srid) for srid in Subverbify.default_subverbifys()]
    omit_srid36s = list(prefs.likes.union(prefs.dislikes,
                                          prefs.recent_views,
                                          default_srid36s))

    # pick random subset of the user's liked srs
    liked_srid36s = (random_sample(prefs.likes, num_liked)
                     if settings.personalized else [])

    # pick random subset of discovery srs
    candidates = set(get_discovery_srid36s()).difference(prefs.dislikes)
    discovery_srid36s = random_sample(candidates, num_discovery)

    # multiget subverbifys
    to_fetch = liked_srid36s + discovery_srid36s
    srs = Subverbify._byID36(to_fetch)
    liked_srs = [srs[sr_id36] for sr_id36 in liked_srid36s]
    discovery_srs = [srs[sr_id36] for sr_id36 in discovery_srid36s]

    if settings.personalized:
        # generate recs from srs we know the user likes
        recommended_srs = get_recommendations(liked_srs,
                                              count=num_recs,
                                              to_omit=omit_srid36s,
                                              source=src,
                                              match_set=False,
                                              over18=settings.nsfw)
        random.shuffle(recommended_srs)

        # split list of recommended srs in half
        midpoint = len(recommended_srs) / 2
        srs_slice1 = recommended_srs[:midpoint]
        srs_slice2 = recommended_srs[midpoint:]

        # get hot links plus top comments from one half
        comment_items = get_comment_items(srs_slice1, src)

        # just get hot links from the other half
        hot_items = get_hot_items(srs_slice2, TYPE_HOT, src)

    if settings.discovery:
        # get links from subverbifys dedicated to discovery
        discovery_items = get_hot_items(discovery_srs, TYPE_DISCOVERY, 'disc')

    if settings.rising:
        # grab some (non-personalized) rising items
        omit_sr_ids = set(int(id36, 36) for id36 in omit_srid36s)
        rising_items = get_rising_items(omit_sr_ids, count=num_rising)

    # combine all items and randomize order to get a mix of types
    all_recs = list(chain(rising_items, comment_items, discovery_items,
                          hot_items))
    random.shuffle(all_recs)

    # make sure subverbifys aren't repeated
    seen_srs = set()
    recs = []
    for r in all_recs:
        if not settings.nsfw and r.is_over18():
            continue
        if not is_visible(r.sr):  # could happen in rising items
            continue
        if r.sr._id not in seen_srs:
            recs.append(r)
            seen_srs.add(r.sr._id)
            if len(recs) >= num_items:
                break
    return recs
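# A sketch of the settings object get_recommended_content expects; the
# real class isn't shown in this file, and these four flags are inferred
# from the attribute accesses above.
class ExampleExploreSettings(object):
    def __init__(self, personalized=True, discovery=True, rising=True,
                 nsfw=False):
        self.personalized = personalized  # use the user's liked srs
        self.discovery = discovery        # mix in discovery subverbifys
        self.rising = rising              # mix in rising links
        self.nsfw = nsfw                  # allow over-18 results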
def compare_pageviews(daysago=0, verbose=False):
    """Evaluate past delivery for promoted links.

    Check frontpage promoted links for their actual delivery compared to
    what would be expected based on their bids.

    """
    date = (datetime.datetime.now(g.tz) -
            datetime.timedelta(days=daysago)).date()
    scheduled = get_scheduled(date)
    pageviews_by_camp = get_campaign_pageviews(date)
    campaigns = filter_campaigns(date, pageviews_by_camp.keys())
    actual = []
    for camp in campaigns:
        link_fullname = '%s_%s' % (LINK_PREFIX, to36(camp.link_id))
        i = (camp._fullname, link_fullname,
             pageviews_by_camp[camp._fullname])
        actual.append(i)

    scheduled_links = {link for camp, link, pageviews in scheduled}
    actual_links = {link for camp, link, pageviews in actual}

    bid_by_link = defaultdict(int)
    total_bid = 0
    pageviews_by_link = defaultdict(int)
    total_pageviews = 0

    for camp, link, bid in scheduled:
        if link not in actual_links:
            if verbose:
                print '%s not found in actual, skipping' % link
            continue
        bid_by_link[link] += bid
        total_bid += bid

    for camp, link, pageviews in actual:
        # not ideal: links shouldn't be here
        if link not in scheduled_links:
            if verbose:
                print '%s not found in schedule, skipping' % link
            continue
        pageviews_by_link[link] += pageviews
        total_pageviews += pageviews

    errors = []
    for link, bid in sorted(bid_by_link.items(), key=lambda t: t[1]):
        pageviews = pageviews_by_link.get(link, 0)
        expected = bid / total_bid
        realized = float(pageviews) / total_pageviews
        difference = (realized - expected) / expected
        errors.append(difference)
        if verbose:
            print '%s - %s - %s - %s' % (link, expected, realized,
                                         difference)

    mean_error, min_error, max_error, stdev_error = error_statistics(errors)
    print '%s' % date
    print ('error %s max, %s min, %s +- %s' %
           (max_error, min_error, mean_error, stdev_error))
    print 'total bid %s' % total_bid
    print ('pageviews for promoted links targeted only to frontpage %s' %
           total_pageviews)
    print ('frontpage pageviews for all promoted links %s' %
           sum(pageviews_by_camp.values()))
    print 'promoted eligible pageviews %s' % get_frontpage_pageviews(date)
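# A worked example of the per-link delivery error computed above, with
# invented numbers: a link holding 25% of total bid that received only
# 20% of total pageviews under-delivered by 20%.
def _example_delivery_error():
    expected = 25.0 / 100.0                  # share of total bid
    realized = 20000.0 / 100000.0            # share of total pageviews
    return (realized - expected) / expected  # -0.2: 20% under-delivery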
@classmethod
def _new(cls, sr_id, flair_type=USER_FLAIR):
    idx = cls(_id=to36(sr_id), sr_id=sr_id)
    idx._commit()
    return idx