Example #1
    def _restrict_sr(sr):
        '''Return a cloudsearch-appropriate query string that restricts
        results to those from sr.

        '''
        if isinstance(sr, MultiVerbify):
            if not sr.sr_ids:
                raise InvalidQuery
            srs = ["sr_id:%s" % sr_id for sr_id in sr.sr_ids]
            return "(or %s)" % ' '.join(srs)
        elif isinstance(sr, DomainSR):
            return "site:'\"%s\"'" % sr.domain
        elif isinstance(sr, FriendsSR):
            if not c.user_is_loggedin or not c.user.friends:
                raise InvalidQuery
            # The query limit is roughly 8k bytes. Limit to 200 friends to
            # avoid getting too close to that limit
            friend_ids = c.user.friends[:200]
            friends = [
                "author_fullname:'%s'" %
                Account._fullname_from_id36(v1utils.to36(id_))
                for id_ in friend_ids
            ]
            return "(or %s)" % ' '.join(friends)
        elif isinstance(sr, AllMinus):
            if not sr.exclude_sr_ids:
                raise InvalidQuery
            exclude_srs = ["sr_id:%s" % sr_id for sr_id in sr.exclude_sr_ids]
            return "(not (or %s))" % ' '.join(exclude_srs)
        elif not isinstance(sr, FakeSubverbify):
            return "sr_id:%s" % sr._id

        return None
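Every example on this page routes integer IDs through to36. For reference, here is a minimal sketch of such a base-36 encoder, assuming the conventional lowercase 0-9a-z digit set (the real v1utils.to36 may differ in validation details):

BASE36_DIGITS = "0123456789abcdefghijklmnopqrstuvwxyz"

def to36(n):
    """Encode a non-negative int as a lowercase base-36 string (sketch)."""
    if n < 0:
        raise ValueError("expected a non-negative integer")
    if n == 0:
        return "0"
    digits = []
    while n:
        n, rem = divmod(n, 36)
        digits.append(BASE36_DIGITS[rem])
    return "".join(reversed(digits))

print to36(12345)  # "9ix"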
Example #2
        def cached_query_wrapper(*args):
            # build the row key from the function name and arguments
            assert fn.__name__.startswith("get_")
            row_key_components = [fn.__name__[len('get_'):]]

            if len(args) > 0:
                # we want to accept either a Thing or a thing's ID at this
                # layer, but the query itself should always get just an ID
                if isinstance(args[0], Thing):
                    args = list(args)
                    args[0] = args[0]._id

                if isinstance(args[0], (int, long)):
                    serialized = to36(args[0])
                else:
                    serialized = str(args[0])
                row_key_components.append(serialized)

            row_key_components.extend(str(x) for x in args[1:])
            row_key = '.'.join(row_key_components)

            query = fn(*args)

            query_sort = query._sort
            try:
                is_precomputed = query.precomputed
            except AttributeError:
                is_precomputed = _is_query_precomputed(query)

            return CachedQuery(model, row_key, query_sort, filter_fn,
                               is_precomputed)
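For intuition, the row key built above is just the query's name minus its get_ prefix, dot-joined with the serialized arguments. A standalone sketch of the same scheme; the name get_submitted and the example ID "9ix" (to36(12345)) are hypothetical:

def build_row_key(fn_name, *args):
    # mirror the scheme above: drop the "get_" prefix, then append each
    # argument's string form, joined with dots
    assert fn_name.startswith("get_")
    components = [fn_name[len("get_"):]]
    components.extend(str(a) for a in args)
    return ".".join(components)

print build_row_key("get_submitted", "9ix", "new")  # "submitted.9ix.new"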
Example #3
    @classmethod
    def by_sr(cls, sr_id, create=False):
        try:
            return cls._byID(to36(sr_id))
        except tdb_cassandra.NotFound:
            if create:
                return cls._new(sr_id)
            raise
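With create=True, a miss falls through to cls._new (see Example #8), which writes a row under the same to36(sr_id) key; without it, the tdb_cassandra.NotFound is re-raised to the caller.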
Example #4
    def add_target_fields(self, target):
        if not target:
            return
        from v1.models import Comment, Link, Message

        self.add("target_id", target._id)
        self.add("target_fullname", target._fullname)
        self.add("target_age_seconds", target._age.total_seconds())

        target_type = target.__class__.__name__.lower()
        if target_type == "link" and target.is_self:
            target_type = "self"
        self.add("target_type", target_type)

        # If the target is an Account or Subverbify (or has a "name" attr),
        # add the target_name
        if hasattr(target, "name"):
            self.add("target_name", target.name)

        # Add info about the target's author for comments, links, & messages
        if isinstance(target, (Comment, Link, Message)):
            author = target.author_slow
            if target._deleted or author._deleted:
                self.add("target_author_id", 0)
                self.add("target_author_name", "[deleted]")
            else:
                self.add("target_author_id", author._id)
                self.add("target_author_name", author.name)

        # Add info about the url being linked to for link posts
        if isinstance(target, Link):
            self.add_text("target_title", target.title)
            if not target.is_self:
                self.add("target_url", target.url)
                self.add("target_url_domain", target.link_domain())

        # Add info about the link being commented on for comments
        if isinstance(target, Comment):
            link_fullname = Link._fullname_from_id36(to36(target.link_id))
            self.add("link_id", target.link_id)
            self.add("link_fullname", link_fullname)

        # Add info about when target was originally posted for links/comments
        if isinstance(target, (Comment, Link)):
            self.add("target_created_ts", to_epoch_milliseconds(target._date))

        hooks.get_hook("eventcollector.add_target_fields").call(
            event=self,
            target=target,
        )
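To make the payload concrete, here is a hypothetical set of fields this method would add for a comment (_id 101) on a link (link_id 99). All values are invented, and the t1_/t3_ fullname prefixes are assumed from the usual Thing type-prefix convention:

event_fields = {
    "target_id": 101,
    "target_fullname": "t1_2t",
    "target_age_seconds": 3600.0,
    "target_type": "comment",
    "target_author_id": 7,
    "target_author_name": "alice",
    "link_id": 99,
    "link_fullname": "t3_2r",
    "target_created_ts": 1434403336829,
}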
Example #5
def migrate_srmember_subscribers(after_user_id=39566712):
    columns = {}
    rowkey = None
    proc_time = time.time()

    for i, rel in enumerate(get_srmembers(after_user_id)):
        sr_id = rel._thing1_id
        user_id = rel._thing2_id
        action_date = rel._date
        new_rowkey = to36(user_id)

        if new_rowkey != rowkey and columns:
            SubscribedSubverbifysByAccount._cf.insert(
                rowkey, columns, timestamp=1434403336829573)
            columns = {}

        columns[to36(sr_id)] = action_date
        rowkey = new_rowkey

        if i % 1000 == 0:
            new_proc_time = time.time()
            duration = new_proc_time - proc_time
            print "%s (%.3f): %s - %s" % (i, duration, user_id, action_date)
            proc_time = new_proc_time

    # flush the final user's accumulated columns, which the loop above
    # otherwise never writes out
    if columns:
        SubscribedSubverbifysByAccount._cf.insert(
            rowkey, columns, timestamp=1434403336829573)
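The hard-coded timestamp (1434403336829573) is presumably deliberate: Cassandra resolves conflicting writes to a column by timestamp, so stamping the backfill with a fixed past value lets any newer live subscription writes win over the migrated data.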
Example #6
def get_recommended_content(prefs, src, settings):
    """Get a mix of content from subverbifys recommended for someone with
    the given preferences (likes and dislikes).

    Returns a list of ExploreItems.

    """
    # numbers chosen empirically to give enough results for explore page
    num_liked = 10  # how many liked srs to use when generating the recs
    num_recs = 20  # how many recommended srs to ask for
    num_discovery = 2  # how many discovery-related subverbifys to mix in
    num_rising = 4  # how many rising links to mix in
    num_items = 20  # total items to return
    rising_items = discovery_items = comment_items = hot_items = []

    # make a list of srs that shouldn't be recommended
    default_srid36s = [to36(srid) for srid in Subverbify.default_subverbifys()]
    omit_srid36s = list(prefs.likes.union(prefs.dislikes,
                                          prefs.recent_views,
                                          default_srid36s))
    # pick random subset of the user's liked srs
    liked_srid36s = random_sample(prefs.likes, num_liked) if settings.personalized else []
    # pick random subset of discovery srs
    candidates = set(get_discovery_srid36s()).difference(prefs.dislikes)
    discovery_srid36s = random_sample(candidates, num_discovery)
    # multiget subverbifys
    to_fetch = liked_srid36s + discovery_srid36s
    srs = Subverbify._byID36(to_fetch)
    liked_srs = [srs[sr_id36] for sr_id36 in liked_srid36s]
    discovery_srs = [srs[sr_id36] for sr_id36 in discovery_srid36s]
    if settings.personalized:
        # generate recs from srs we know the user likes
        recommended_srs = get_recommendations(liked_srs,
                                              count=num_recs,
                                              to_omit=omit_srid36s,
                                              source=src,
                                              match_set=False,
                                              over18=settings.nsfw)
        random.shuffle(recommended_srs)
        # split list of recommended srs in half
        midpoint = len(recommended_srs) / 2
        srs_slice1 = recommended_srs[:midpoint]
        srs_slice2 = recommended_srs[midpoint:]
        # get hot links plus top comments from one half
        comment_items = get_comment_items(srs_slice1, src)
        # just get hot links from the other half
        hot_items = get_hot_items(srs_slice2, TYPE_HOT, src)
    if settings.discovery:
        # get links from subverbifys dedicated to discovery
        discovery_items = get_hot_items(discovery_srs, TYPE_DISCOVERY, 'disc')
    if settings.rising:
        # grab some (non-personalized) rising items
        omit_sr_ids = set(int(id36, 36) for id36 in omit_srid36s)
        rising_items = get_rising_items(omit_sr_ids, count=num_rising)
    # combine all items and randomize order to get a mix of types
    all_recs = list(chain(rising_items,
                          comment_items,
                          discovery_items,
                          hot_items))
    random.shuffle(all_recs)
    # make sure subverbifys aren't repeated
    seen_srs = set()
    recs = []
    for r in all_recs:
        if not settings.nsfw and r.is_over18():
            continue
        if not is_visible(r.sr):  # could happen in rising items
            continue
        if r.sr._id not in seen_srs:
            recs.append(r)
            seen_srs.add(r.sr._id)
        if len(recs) >= num_items:
            break
    return recs
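Two details worth noting: midpoint = len(recommended_srs) / 2 relies on Python 2 floor division (it would need // under Python 3), and because the combined list is shuffled before the dedup loop, each subverbify's single slot in the result is filled by a randomly chosen one of its candidate items.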
Example #7
def compare_pageviews(daysago=0, verbose=False):
    """Evaluate past delivery for promoted links.

    Check frontpage promoted links for their actual delivery compared to what
    would be expected based on their bids.

    """

    date = (datetime.datetime.now(g.tz) -
            datetime.timedelta(days=daysago)).date()

    scheduled = get_scheduled(date)
    pageviews_by_camp = get_campaign_pageviews(date)
    campaigns = filter_campaigns(date, pageviews_by_camp.keys())
    actual = []
    for camp in campaigns:
        link_fullname = '%s_%s' % (LINK_PREFIX, to36(camp.link_id))
        i = (camp._fullname, link_fullname, pageviews_by_camp[camp._fullname])
        actual.append(i)

    scheduled_links = {link for camp, link, pageviews in scheduled}
    actual_links = {link for camp, link, pageviews in actual}

    bid_by_link = defaultdict(int)
    total_bid = 0

    pageviews_by_link = defaultdict(int)
    total_pageviews = 0

    for camp, link, bid in scheduled:
        if link not in actual_links:
            if verbose:
                print '%s not found in actual, skipping' % link
            continue

        bid_by_link[link] += bid
        total_bid += bid

    for camp, link, pageviews in actual:
        # not ideal: links shouldn't be here
        if link not in scheduled_links:
            if verbose:
                print '%s not found in schedule, skipping' % link
            continue

        pageviews_by_link[link] += pageviews
        total_pageviews += pageviews

    errors = []
    for link, bid in sorted(bid_by_link.items(), key=lambda t: t[1]):
        pageviews = pageviews_by_link.get(link, 0)
        expected = float(bid) / total_bid  # guard against Py2 integer division
        realized = float(pageviews) / total_pageviews
        difference = (realized - expected) / expected
        errors.append(difference)
        if verbose:
            print '%s - %s - %s - %s' % (link, expected, realized, difference)

    mean_error, min_error, max_error, stdev_error = error_statistics(errors)

    print '%s' % date
    print('error %s max, %s min, %s +- %s' %
          (max_error, min_error, mean_error, stdev_error))
    print 'total bid %s' % total_bid
    print('pageviews for promoted links targeted only to frontpage %s' %
          total_pageviews)
    print('frontpage pageviews for all promoted links %s' %
          sum(pageviews_by_camp.values()))
    print 'promoted eligible pageviews %s' % get_frontpage_pageviews(date)
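As a worked example of the error metric (numbers invented): a link bid at 30 out of a 100 total has expected = 0.3; if it received 250 of 1,000 frontpage pageviews, realized = 0.25, so difference = (0.25 - 0.3) / 0.3 ≈ -0.167, i.e. roughly 17% under-delivery relative to its bid share.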
Example #8
    @classmethod
    def _new(cls, sr_id, flair_type=USER_FLAIR):
        idx = cls(_id=to36(sr_id), sr_id=sr_id)
        idx._commit()
        return idx
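Note that flair_type is accepted but unused in the snippet shown; the row is keyed purely by the base-36 sr_id, which is what lets by_sr in Example #3 find it again.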