def update_flair_counts():
    flairs = Counter()
    user_ids = []

    sr = Subreddit._byID(g.live_config["thebutton_srid"], data=True)
    raw = AccountsActiveBySR._cf.xget(sr._id36)
    for uid, _ in raw:
        user_ids.append(uid)

    for user_chunk in in_chunks(user_ids, size=100):
        users = Account._byID36(user_chunk, data=True, return_dict=False)
        for user in users:
            flair = user.flair_css_class(sr._id)
            if not flair:
                if user._date < ACCOUNT_CREATION_CUTOFF:
                    flair = "no-press"
                else:
                    flair = "cant-press"

            flairs[flair] += 1

    if 'cheater' in flairs:
        del flairs['cheater']

    sr.flair_counts = sorted(
        flairs.iteritems(),
        key=lambda x: 'z' if x[0] == 'no-press' else x[0],
        reverse=True)
    sr._commit()
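Every snippet on this page assumes the in_chunks helper from r2.lib.utils. As a point of reference, here is a minimal sketch of what that helper is taken to do, inferred only from how the examples call it; the name in_chunks_sketch and the default size of 25 are assumptions, not the library's actual code.

def in_chunks_sketch(it, size=25):
    """Lazily group any iterable into lists of at most `size` items."""
    chunk = []
    for item in it:
        chunk.append(item)
        if len(chunk) >= size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk

# e.g. list(in_chunks_sketch(range(7), size=3)) -> [[0, 1, 2], [3, 4, 5], [6]]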
Example #2
def update_flair_counts():
    flairs = Counter()

    sr = Subreddit._byID(g.live_config["thebutton_srid"], data=True)
    # every account with recorded button activity
    user_ids = [ba._id36 for ba in ButtonActivity._all()]

    for user_chunk in in_chunks(user_ids, size=100):
        users = Account._byID36(user_chunk, data=True, return_dict=False)
        for user in users:
            flair = user.flair_css_class(sr._id)
            if not flair:
                if user._date < ACCOUNT_CREATION_CUTOFF:
                    flair = "no-press"
                else:
                    flair = "cant-press"

            flairs[flair] += 1

    if 'cheater' in flairs:
        del flairs['cheater']

    sr.flair_counts = sorted(flairs.iteritems(),
                             key=lambda x: 'z' if x[0] == 'no-press' else x[0],
                             reverse=True)
    sr._commit()
Example #3
def _rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=SolrLinkUploader, estimate=50000000, 
                       chunk_size=1000):
    uploader = uploader()

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        uploader.fullnames = [c._fullname for c in chunk] 
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
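The rebuild examples above and below all wrap uploader.inject() in the same for/else retry idiom: the else branch of a for loop runs only if the loop was never broken, i.e. only when every attempt failed. A generic sketch of that pattern with hypothetical names (do_upload, MAX_ATTEMPTS); only the idiom itself is taken from the snippets.

import httplib
import time

MAX_ATTEMPTS = 5  # the snippets hard-code range(5)

def upload_with_retries(do_upload):
    for attempt in range(MAX_ATTEMPTS):
        try:
            do_upload()
        except httplib.HTTPException as err:
            time.sleep(attempt)  # crude backoff: 0s, 1s, 2s, ...
            continue
        else:
            break  # success, so the for/else is skipped
    else:
        # the loop was never broken: every attempt failed, so surface
        # the last error (err survives the handler in Python 2)
        raise err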
Example #4
def rebuild_index(start_at=None, sleeptime=1, cls=Link, estimate=50000000,
                  chunk_size=1000):
    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(start_at)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             _REBUILD_INDEX_CACHE_KEY)
    
    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        for x in range(5):
            try:
                inject(chunk)
            except httplib.HTTPException as err:
                print "Got  %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(_REBUILD_INDEX_CACHE_KEY, last_update._fullname)
        time.sleep(sleeptime)
Example #5
def update_activity():
    events = {}
    event_counts = collections.Counter()

    query = (ev for ev in LiveUpdateEvent._all()
             if ev.state == "live" and not ev.banned)
    for chunk in utils.in_chunks(query, size=100):
        context_ids = {ev._fullname: ev._id for ev in chunk}

        view_countable = [ev._fullname for ev in chunk
                          if ev._date >= g.liveupdate_min_date_viewcounts]
        view_counts_query = ViewCountsQuery.execute_async(view_countable)

        try:
            with c.activity_service.retrying(attempts=4) as svc:
                infos = svc.count_activity_multi(context_ids.keys())
        except TTransportException:
            continue

        view_counts = view_counts_query.result()

        for context_id, info in infos.iteritems():
            event_id = context_ids[context_id]

            try:
                LiveUpdateActivityHistoryByEvent.record_activity(
                    event_id, info.count)
            except tdb_cassandra.TRANSIENT_EXCEPTIONS as e:
                g.log.warning("Failed to update activity history for %r: %s",
                              event_id, e)

            try:
                event = LiveUpdateEvent.update_activity(
                    event_id, info.count, info.is_fuzzed)
            except tdb_cassandra.TRANSIENT_EXCEPTIONS as e:
                g.log.warning("Failed to update event activity for %r: %s",
                              event_id, e)
            else:
                events[event_id] = event
                event_counts[event_id] = info.count

            websockets.send_broadcast(
                "/live/" + event_id,
                type="activity",
                payload={
                    "count": info.count,
                    "fuzzed": info.is_fuzzed,
                    "total_views": view_counts.get(context_id),
                },
            )

    top_event_ids = [event_id for event_id, count in event_counts.most_common(1000)]
    top_events = [events[event_id] for event_id in top_event_ids]
    query_ttl = datetime.timedelta(days=3)
    with CachedQueryMutator() as m:
        m.replace(get_active_events(), top_events, ttl=query_ttl)

    # ensure that all the amqp messages we've put on the worker's queue are
    # sent before we allow this script to exit.
    amqp.worker.join()
Example #6
def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
Example #7
def activate_names_requested_in(link):
    tree = get_comment_tree(link)
    acceptable_names = []
    if tree.tree:
        top_level_cids = tree.tree[None]
        comments = chain.from_iterable(Comment._byID(chunk, return_dict=False,
                                                     data=True)
                                       for chunk in in_chunks(top_level_cids))

        for comment in sorted(comments, key=lambda c: c._ups, reverse=True):
            if comment._spam or comment._deleted:
                continue

            sanitized = comment.body.strip()
            match = valid_name_re.search(sanitized)
            if match:
                acceptable_names.append((comment, match.group(1)))

    # we activate one name for each 100% of rev goal met
    names = acceptable_names[:link.revenue_bucket]
    activate_names(link, names)

    activated_names = [name for comment, name in names]
    link.server_names = activated_names
    link.flair_text = ", ".join(activated_names) if names else "/dev/null"
    link.flair_css_class = "goal-bucket-%d" % link.revenue_bucket
    link._commit()
Example #8
def _generate_sitemaps(links, set_lastmod=True):
    """Create an iterator of sitemaps.

    Each sitemap has up to 50000 links, being the maximum allowable number of
    links according to the sitemap standard.
    """
    for subreddit_chunks in in_chunks(links, LINKS_PER_SITEMAP):
        yield generate_sitemap_from_links(subreddit_chunks, set_lastmod=set_lastmod)
Example #9
def subreddit_sitemaps(subreddits):
    """Create an array of sitemaps.

    Each sitemap has up to 50000 links, being the maximum allowable number of
    links according to the sitemap standard.
    """
    for subreddit_chunks in in_chunks(subreddits, LINKS_PER_SITEMAP):
        yield _subreddit_sitemap(subreddit_chunks)
Example #10
def shorten_byurl_keys():
    """We changed by_url keys from a format like
           byurl_google.com...
       to:
           byurl(1d5920f4b44b27a802bd77c4f0536f5a, google.com...)
       so that they would fit in memcache's 251-char limit
    """

    from datetime import datetime
    from hashlib import md5
    from r2.models import Link
    from r2.lib.filters import _force_utf8
    from pylons import g
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.db.operators import desc
    from r2.lib.utils import base_url, progress

    # from link.py
    def old_by_url_key(url):
        prefix = 'byurl_'
        s = _force_utf8(base_url(url.lower()))
        return '%s%s' % (prefix, s)

    def new_by_url_key(url):
        maxlen = 250
        template = 'byurl(%s,%s)'
        keyurl = _force_utf8(base_url(url.lower()))
        hexdigest = md5(keyurl).hexdigest()
        usable_len = maxlen - len(template) - len(hexdigest)
        return template % (hexdigest, keyurl[:usable_len])

    verbosity = 1000

    l_q = Link._query(Link.c._spam == (True, False),
                      data=True,
                      sort=desc('_date'))
    for links in (in_chunks(
            progress(
                fetch_things2(l_q, verbosity),
                key=lambda link: link._date,
                verbosity=verbosity,
                estimate=int(9.9e6),
                persec=True,
            ), verbosity)):
        # only links with actual URLs
        links = filter(
            lambda link:
            (not getattr(link, 'is_self', False) and getattr(link, 'url', '')),
            links)

        # old key -> new key
        translate = dict((old_by_url_key(link.url), new_by_url_key(link.url))
                         for link in links)

        old = g.permacache.get_multi(translate.keys())
        new = dict((translate[old_key], value)
                   for (old_key, value) in old.iteritems())
        g.permacache.set_multi(new)
Example #11
def _generate_sitemaps(links, set_lastmod=True):
    """Create an iterator of sitemaps.

    Each sitemap has up to 50000 links, being the maximum allowable number of
    links according to the sitemap standard.
    """
    for subreddit_chunks in in_chunks(links, LINKS_PER_SITEMAP):
        yield generate_sitemap_from_links(subreddit_chunks,
                                          set_lastmod=set_lastmod)
Example #12
def update_activity():
    events = {}
    event_counts = collections.Counter()

    query = (ev for ev in LiveUpdateEvent._all()
             if ev.state == "live" and not ev.banned)
    for chunk in utils.in_chunks(query, size=100):
        context_ids = {"LiveUpdateEvent_" + ev._id: ev._id for ev in chunk}

        try:
            with c.activity_service.retrying(attempts=4) as svc:
                infos = svc.count_activity_multi(context_ids.keys())
        except TTransportException:
            continue

        for context_id, info in infos.iteritems():
            event_id = context_ids[context_id]

            try:
                LiveUpdateActivityHistoryByEvent.record_activity(
                    event_id, info.count)
            except tdb_cassandra.TRANSIENT_EXCEPTIONS as e:
                g.log.warning("Failed to update activity history for %r: %s",
                              event_id, e)

            try:
                event = LiveUpdateEvent.update_activity(
                    event_id, info.count, info.is_fuzzed)
            except tdb_cassandra.TRANSIENT_EXCEPTIONS as e:
                g.log.warning("Failed to update event activity for %r: %s",
                              event_id, e)
            else:
                events[event_id] = event
                event_counts[event_id] = info.count

            websockets.send_broadcast(
                "/live/" + event_id,
                type="activity",
                payload={
                    "count": info.count,
                    "fuzzed": info.is_fuzzed,
                },
            )

    top_event_ids = [
        event_id for event_id, count in event_counts.most_common(1000)
    ]
    top_events = [events[event_id] for event_id in top_event_ids]
    query_ttl = datetime.timedelta(days=3)
    with CachedQueryMutator() as m:
        m.replace(get_active_events(), top_events, ttl=query_ttl)

    # ensure that all the amqp messages we've put on the worker's queue are
    # sent before we allow this script to exit.
    amqp.worker.join()
Example #13
    def get_details(cls, thing, voters=None):
        from r2.models import Comment, Link
        if isinstance(thing, Link):
            details_cls = VoteDetailsByLink
        elif isinstance(thing, Comment):
            details_cls = VoteDetailsByComment
        else:
            raise ValueError

        voter_id36s = None
        if voters:
            voter_id36s = [voter._id36 for voter in voters]

        try:
            row = details_cls._byID(thing._id36, properties=voter_id36s)
            raw_details = row._values()
        except tdb_cassandra.NotFound:
            return []

        try:
            row = VoterIPByThing._byID(thing._fullname, properties=voter_id36s)
            ips = row._values()
        except tdb_cassandra.NotFound:
            ips = {}

        # look up all the accounts in batches of 100
        account_id36s = set(raw_details.keys())
        accounts = {}
        for id_chunk in in_chunks(account_id36s, size=100):
            accounts.update(Account._byID36(id_chunk, data=True))

        details = []
        for voter_id36, json_data in raw_details.iteritems():
            vote_data = json.loads(json_data)
            vote_data = cls.convert_old_details(vote_data)

            extra_data = vote_data["data"]
            extra_data["ip"] = ips.get(voter_id36)

            vote = Vote(
                user=accounts[voter_id36],
                thing=thing,
                direction=Vote.deserialize_direction(vote_data["direction"]),
                date=datetime.utcfromtimestamp(vote_data["date"]),
                data=extra_data,
                effects=vote_data["effects"],
                get_previous_vote=False,
            )

            details.append(vote)

        details.sort(key=lambda d: d.date)

        return details
Example #15
    def bulk_load(self, start='', end='', chunk_size=100):
        """Try to load everything out of Cassandra and put it into
           memcached"""
        cf = self.cassa.cf
        for rows in in_chunks(
                cf.get_range(start=start, finish=end, columns=['value']),
                chunk_size):
            print rows[0][0]
            rows = dict((key, pickle.loads(cols['value']))
                        for (key, cols) in rows
                        if (cols
                            # hack
                            and len(key) < 250))
            self.memcache.set_multi(rows)
Example #16
def rebuild_index(after_id = None):
    cls = Link

    # don't pull spam/deleted
    q = cls._query(sort=desc('_date'), data=True)

    if after_id:
        q._after(cls._byID(after_id))

    q = fetch_things2(q)

    q = progress(q, verbosity=1000, estimate=10000000, persec=True)
    for chunk in in_chunks(q):
        inject(chunk)
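Most of these scripts feed in_chunks from r2.lib.utils.fetch_things2, which pages through a Thing query instead of materialising it. A rough sketch of the behaviour the scripts assume, simplified from how they use it (the real helper also supports batch functions and chunked yielding):

def fetch_things2_sketch(query, chunk_size=100):
    """Yield every result of `query`, re-issuing the query after the last
       row of each batch so the full result set is never held in memory."""
    query._limit = chunk_size
    batch = list(query)
    while batch:
        for thing in batch:
            yield thing
        if len(batch) < chunk_size:
            break  # a short batch means we reached the end
        query._after(batch[-1])  # resume just past the last thing seen
        batch = list(query)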
Example #17
    def simple_get_multi(self, keys):
        results = {}
        category_bundles = {}
        for key in keys:
            category, ids = self._split_key(key)
            category_bundles.setdefault(category, []).append(ids)

        for category in category_bundles:
            idses = category_bundles[category]
            chunks = in_chunks(idses, size=50)
            for chunk in chunks:
                new_results = self.backend.get_multi(category, chunk)
                results.update(new_results)

        return results
Example #18
def give_trophies(users):
    for fullnames in in_chunks(progress(users, verbosity=50), size=50):
        users = Account._by_fullname(fullnames, return_dict=False)

        for user in users:
            team = get_user_team(user)

            trophy = Award.give_if_needed(
                codename="f2p_orangered" if team == "red" else "f2p_periwinkle",
                user=user,
            )
            if trophy:
                trophy._commit()

        sleep(.5)
Example #20
def get_recent_name_submissions():
    link_fullnames = list(queries.get_links(SERVERNAME_SR, "new", "all"))
    links = chain.from_iterable(Thing._by_fullname(chunk, return_dict=False)
                                for chunk in in_chunks(link_fullnames))

    for link in links:
        if link._deleted or link._spam:
            continue

        # OH GOD WHAT HAVE YOU POSTED IN MY LOVELY AUTOMATED SUBREDDIT!?
        if (not hasattr(link, "revenue_date") or
            not hasattr(link, "revenue_bucket") or
            not hasattr(link, "server_names")):
            continue

        yield link
Example #21
def port_deleted_links(after_id=None):
    from r2.models import Link
    from r2.lib.db.operators import desc
    from r2.models.query_cache import CachedQueryMutator
    from r2.lib.db.queries import get_deleted_links
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True, Link.c._spam == (True, False), sort=desc("_date"), data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
Example #22
    def bulk_load(self, start='', end='', chunk_size=100):
        """Try to load everything out of Cassandra and put it into
           memcached"""
        cf = self.cassa.cf
        for rows in in_chunks(cf.get_range(start=start,
                                           finish=end,
                                           columns=['value']),
                              chunk_size):
            print rows[0][0]
            rows = dict((key, pickle.loads(cols['value']))
                        for (key, cols) in rows
                        if (cols
                            # hack
                            and len(key) < 250))
            self.memcache.set_multi(rows)
Example #23
def rebuild_link_index(start_at=None,
                       sleeptime=1,
                       cls=Link,
                       uploader=LinkUploader,
                       doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000,
                       chunk_size=1000):
    cache_key = _REBUILD_INDEX_CACHE_KEY % uploader.__name__.lower()
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(cache_key)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             cache_key)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'),
                   data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q,
                         verbosity=1000,
                         estimate=estimate,
                         persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        #uploader.things = chunk
        uploader.fullnames = [link._fullname for link in chunk]
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(cache_key, last_update._fullname)
        time.sleep(sleeptime)
Example #24
def port_deleted_links(after_id=None):
    from r2.models import Link
    from r2.lib.db.operators import desc
    from r2.models.query_cache import CachedQueryMutator
    from r2.lib.db.queries import get_deleted_links
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
Example #25
def rebuild_index(after_id = None, estimate=10000000):
    cls = Link

    # don't pull spam/deleted
    q = cls._query(sort=desc('_date'), data=True)

    if after_id:
        q._after(cls._byID(after_id))

    q = fetch_things2(q)

    def key(link):
        # we're going back in time, so this will give us a good idea
        # of how far we've gone
        return "%s/%s" % (link._id, link._date)

    q = progress(q, verbosity=1000, estimate=estimate, persec=True, key=key)
    for chunk in in_chunks(q):
        inject(chunk)
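The progress() wrapper the rebuild scripts thread their queries through is assumed to be a pass-through iterator that periodically reports position and rate, with key formatting the most recent item. A rough sketch under that assumption (progress_sketch and its exact output format are illustrative, not the library's code):

import time

def progress_sketch(it, verbosity=1000, estimate=None, persec=False, key=repr):
    start = time.time()
    for seen, item in enumerate(it, 1):
        yield item
        if seen % verbosity == 0:
            msg = "%d" % seen
            if estimate:
                msg += "/%d (%.1f%%)" % (estimate, 100.0 * seen / estimate)
            if persec:
                msg += ", %.1f/s" % (seen / (time.time() - start))
            print "%s  last: %s" % (msg, key(item))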
Example #26
def rebuild_index(after_id=None, estimate=10000000):
    cls = Link

    # don't pull spam/deleted
    q = cls._query(sort=desc('_date'), data=True)

    if after_id:
        q._after(cls._byID(after_id))

    q = fetch_things2(q)

    def key(link):
        # we're going back in time, so this will give us a good idea
        # of how far we've gone
        return "%s/%s" % (link._id, link._date)

    q = progress(q, verbosity=1000, estimate=estimate, persec=True, key=key)
    for chunk in in_chunks(q):
        inject(chunk)
Example #27
def backfill_vote_details(cls):
    ninety_days = timedelta(days=90).total_seconds()
    for chunk in in_chunks(cls._all(), size=100):
        detail_chunk = defaultdict(dict)
        try:
            with VoterIPByThing._cf.batch(
                    write_consistency_level=cls._write_consistency_level) as b:
                for vote_list in chunk:
                    thing_id36 = vote_list._id
                    thing_fullname = vote_list.votee_fullname
                    details = vote_list.decode_details()
                    for detail in details:
                        voter_id36 = detail["voter_id"]
                        if "ip" in detail and detail["ip"]:
                            ip = detail["ip"]
                            redacted = dict(detail)
                            del redacted["ip"]
                            cast = detail["date"]
                            now = epoch_seconds(
                                datetime.utcnow().replace(tzinfo=g.tz))
                            ttl = ninety_days - (now - cast)
                            oneweek = ""
                            if ttl < 3600 * 24 * 7:
                                oneweek = "(<= one week left)"
                            print "Inserting %s with IP ttl %d %s" % (
                                redacted, ttl, oneweek)
                            detail_chunk[thing_id36][voter_id36] = json.dumps(
                                redacted)
                            if ttl <= 0:
                                print "Skipping bogus ttl for %s: %d" % (
                                    redacted, ttl)
                                continue
                            b.insert(thing_fullname, {voter_id36: ip}, ttl=ttl)
        except Exception:
            # Getting some really weird spurious errors here; complaints about negative
            # TTLs even though they can't possibly be negative, errors from cass
            # that have an explanation of "(why=')"
            # Just going to brute-force this through.  We might lose 100 here and there
            # but mostly it'll be intact.
            pass
        for votee_id36, valuedict in detail_chunk.iteritems():
            cls._set_values(votee_id36, valuedict)
Example #28
def _populate(after_id=None, estimate=54301242):
    from r2.models import desc
    from r2.lib.db import tdb_cassandra
    from r2.lib import utils

    # larger has a chance to decrease the number of Cassandra writes,
    # but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False), Comment.c._deleted == (True, False), sort=desc("_date"))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, "link_id"), chunk)
        update_comment_votes(chunk)
Example #29
def rebuild_link_index(
    start_at=None,
    sleeptime=1,
    cls=Link,
    uploader=LinkUploader,
    doc_api="CLOUDSEARCH_DOC_API",
    estimate=50000000,
    chunk_size=1000,
):
    cache_key = _REBUILD_INDEX_CACHE_KEY % uploader.__name__.lower()
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(cache_key)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" % cache_key)

    q = cls._query(cls.c._deleted == (True, False), sort=desc("_date"), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True, key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(cache_key, last_update._fullname)
        time.sleep(sleeptime)
Example #30
def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrl
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False), sort=desc("_date"), data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q if getattr(l, "url", "self") != "self" and not getattr(l, "is_self", False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        with LinksByUrl._cf.batch(write_consistency_level=CL.ONE) as b:
            for l in chunk:
                k = LinksByUrl._key_from_url(l.url)
                if k:
                    b.insert(k, {l._id36: l._id36})
Example #31
def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrlAndSubreddit
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    if after_id:
        q._after(Link._byID(after_id,data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q
         if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubreddit.add_link(l)
Example #32
def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrlAndSubreddit
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'),
                    data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubreddit.add_link(l)
Example #33
def _location_by_ips(ips):
    if not hasattr(g, 'geoip_location'):
        g.log.warning("g.geoip_location not set. skipping GeoIP lookup.")
        return {}

    ret = {}
    for batch in in_chunks(ips, MAX_IPS_PER_GROUP):
        ip_string = '+'.join(batch)
        url = os.path.join(g.geoip_location, 'geoip', ip_string)

        try:
            response = urllib2.urlopen(url=url, timeout=3)
            json_data = response.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as e:
            g.log.warning("Failed to fetch GeoIP information: %r" % e)
            continue

        try:
            ret.update(json.loads(json_data))
        except ValueError as e:
            g.log.warning("Invalid JSON response for GeoIP lookup: %r" % e)
            continue

    return ret
Example #34
def rebuild_index(start_at=None,
                  sleeptime=1,
                  cls=Link,
                  estimate=50000000,
                  chunk_size=1000):
    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(start_at)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             _REBUILD_INDEX_CACHE_KEY)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'),
                   data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q,
                         verbosity=1000,
                         estimate=estimate,
                         persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        for x in range(5):
            try:
                inject(chunk)
            except httplib.HTTPException as err:
                print "Got  %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(_REBUILD_INDEX_CACHE_KEY, last_update._fullname)
        time.sleep(sleeptime)
Example #36
def _populate(after_id=None, estimate=54301242):
    from r2.models import desc
    from r2.lib.db import tdb_cassandra
    from r2.lib import utils

    # larger has a chance to decrease the number of Cassandra writes,
    # but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc('_date'))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, 'link_id'), chunk)
        update_comment_votes(chunk)
Example #37
def backfill_vote_details(cls):
    ninety_days = timedelta(days=90).total_seconds()
    for chunk in in_chunks(cls._all(), size=100):
        detail_chunk = defaultdict(dict)
        try:
            with VoterIPByThing._cf.batch(write_consistency_level=cls._write_consistency_level) as b:
                for vote_list in chunk:
                    thing_id36 = vote_list._id
                    thing_fullname = vote_list.votee_fullname
                    details = vote_list.decode_details()
                    for detail in details:
                        voter_id36 = detail["voter_id"]
                        if "ip" in detail and detail["ip"]:
                            ip = detail["ip"]
                            redacted = dict(detail)
                            del redacted["ip"]
                            cast = detail["date"]
                            now = epoch_seconds(datetime.utcnow().replace(tzinfo=g.tz))
                            ttl = ninety_days - (now - cast)
                            oneweek = ""
                            if ttl < 3600 * 24 * 7:
                                oneweek = "(<= one week left)"
                            print "Inserting %s with IP ttl %d %s" % (redacted, ttl, oneweek)
                            detail_chunk[thing_id36][voter_id36] = json.dumps(redacted)
                            if ttl <= 0:
                                print "Skipping bogus ttl for %s: %d" % (redacted, ttl)
                                continue
                            b.insert(thing_fullname, {voter_id36: ip}, ttl=ttl)
        except Exception:
            # Getting some really weird spurious errors here; complaints about negative
            # TTLs even though they can't possibly be negative, errors from cass
            # that have an explanation of "(why=')"
            # Just going to brute-force this through.  We might lose 100 here and there
            # but mostly it'll be intact.
            pass
        for votee_id36, valuedict in detail_chunk.iteritems():
            cls._set_values(votee_id36, valuedict)
Example #38
def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrl
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'),
                    data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        with LinksByUrl._cf.batch(write_consistency_level=CL.ONE) as b:
            for l in chunk:
                k = LinksByUrl._key_from_url(l.url)
                if k:
                    b.insert(k, {l._id36: l._id36})
Example #39
def pushup_permacache(verbosity=1000):
    """When putting cassandra into the permacache chain, we need to
       push everything up into the rest of the chain, so this is
       everything that uses the permacache, as of that check-in."""
    from pylons import g
    from r2.models import Link, Subreddit, Account
    from r2.lib.db.operators import desc
    from r2.lib.comment_tree import comments_key, messages_key
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.utils import last_modified_key
    from r2.lib.promote import promoted_memo_key
    from r2.lib.subreddit_search import load_all_reddits
    from r2.lib.db import queries
    from r2.lib.cache import CassandraCacheChain

    authority = g.permacache.caches[-1]
    nonauthority = CassandraCacheChain(g.permacache.caches[1:-1])

    def populate(keys):
        vals = authority.simple_get_multi(keys)
        if vals:
            nonauthority.set_multi(vals)

    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_reddits()

        yield queries.get_all_comments().iden

        l_q = Link._query(Link.c._spam == (True, False),
                          Link.c._deleted == (True, False),
                          sort=desc('_date'),
                          data=True,
                          )
        for link in fetch_things2(l_q, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        a_q = Account._query(Account.c._spam == (True, False),
                             sort=desc('_date'),
                             )
        for account in fetch_things2(a_q, verbosity):
            yield messages_key(account._id)
            yield last_modified_key(account, 'overview')
            yield last_modified_key(account, 'commented')
            yield last_modified_key(account, 'submitted')
            yield last_modified_key(account, 'liked')
            yield last_modified_key(account, 'disliked')
            yield queries.get_comments(account, 'new', 'all').iden
            yield queries.get_submitted(account, 'new', 'all').iden
            yield queries.get_liked(account).iden
            yield queries.get_disliked(account).iden
            yield queries.get_hidden(account).iden
            yield queries.get_saved(account).iden
            yield queries.get_inbox_messages(account).iden
            yield queries.get_unread_messages(account).iden
            yield queries.get_inbox_comments(account).iden
            yield queries.get_unread_comments(account).iden
            yield queries.get_inbox_selfreply(account).iden
            yield queries.get_unread_selfreply(account).iden
            yield queries.get_sent(account).iden

        sr_q = Subreddit._query(Subreddit.c._spam == (True, False),
                                sort=desc('_date'),
                                )
        for sr in fetch_things2(sr_q, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')
            yield queries.get_links(sr, 'hot', 'all').iden
            yield queries.get_links(sr, 'new', 'all').iden

            for sort in 'top', 'controversial':
                for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                    yield queries.get_links(sr, sort, time,
                                            merge_batched=False).iden
            yield queries.get_spam_links(sr).iden
            yield queries.get_spam_comments(sr).iden
            yield queries.get_reported_links(sr).iden
            yield queries.get_reported_comments(sr).iden
            yield queries.get_subreddit_messages(sr).iden
            yield queries.get_unread_subreddit_messages(sr).iden

    done = 0
    for keys in in_chunks(gen_keys(), verbosity):
        g.reset_caches()
        done += len(keys)
        print 'Done %d: %r' % (done, keys[-1])
        populate(keys)
Example #40
def shorten_byurl_keys():
    """We changed by_url keys from a format like
           byurl_google.com...
       to:
           byurl(1d5920f4b44b27a802bd77c4f0536f5a, google.com...)
       so that they would fit in memcache's 251-char limit
    """

    from datetime import datetime
    from hashlib import md5
    from r2.models import Link
    from r2.lib.filters import _force_utf8
    from pylons import g
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.db.operators import desc
    from r2.lib.utils import base_url, progress

    # from link.py
    def old_by_url_key(url):
        prefix='byurl_'
        s = _force_utf8(base_url(url.lower()))
        return '%s%s' % (prefix, s)
    def new_by_url_key(url):
        maxlen = 250
        template = 'byurl(%s,%s)'
        keyurl = _force_utf8(base_url(url.lower()))
        hexdigest = md5(keyurl).hexdigest()
        usable_len = maxlen-len(template)-len(hexdigest)
        return template % (hexdigest, keyurl[:usable_len])

    verbosity = 1000

    l_q = Link._query(
        Link.c._spam == (True, False),
        data=True,
        sort=desc('_date'))
    for links in (
        in_chunks(
            progress(
                fetch_things2(l_q, verbosity),
                key = lambda link: link._date,
                verbosity=verbosity,
                estimate=int(9.9e6),
                persec=True,
                ),
            verbosity)):
        # only links with actual URLs
        links = filter(lambda link: (not getattr(link, 'is_self', False)
                                     and getattr(link, 'url', '')),
                       links)

        # old key -> new key
        translate = dict((old_by_url_key(link.url),
                          new_by_url_key(link.url))
                         for link in links)

        old = g.permacache.get_multi(translate.keys())
        new = dict((translate[old_key], value)
                   for (old_key, value)
                   in old.iteritems())
        g.permacache.set_multi(new)
Example #41
def pushup_permacache(verbosity=1000):
    """When putting cassandra into the permacache chain, we need to
       push everything up into the rest of the chain, so this is
       everything that uses the permacache, as of that check-in."""
    from pylons import app_globals as g
    from r2.models import Link, Subreddit, Account
    from r2.lib.db.operators import desc
    from r2.lib.comment_tree import comments_key, messages_key
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.utils import last_modified_key
    from r2.lib.promote import promoted_memo_key
    from r2.lib.subreddit_search import load_all_reddits
    from r2.lib.db import queries
    from r2.lib.cache import CassandraCacheChain

    authority = g.permacache.caches[-1]
    nonauthority = CassandraCacheChain(g.permacache.caches[1:-1])

    def populate(keys):
        vals = authority.simple_get_multi(keys)
        if vals:
            nonauthority.set_multi(vals)

    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_reddits()

        yield queries.get_all_comments().iden

        l_q = Link._query(
            Link.c._spam == (True, False),
            Link.c._deleted == (True, False),
            sort=desc('_date'),
            data=True,
        )
        for link in fetch_things2(l_q, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        a_q = Account._query(
            Account.c._spam == (True, False),
            sort=desc('_date'),
        )
        for account in fetch_things2(a_q, verbosity):
            yield messages_key(account._id)
            yield last_modified_key(account, 'overview')
            yield last_modified_key(account, 'commented')
            yield last_modified_key(account, 'submitted')
            yield last_modified_key(account, 'liked')
            yield last_modified_key(account, 'disliked')
            yield queries.get_comments(account, 'new', 'all').iden
            yield queries.get_submitted(account, 'new', 'all').iden
            yield queries.get_liked(account).iden
            yield queries.get_disliked(account).iden
            yield queries.get_hidden(account).iden
            yield queries.get_saved(account).iden
            yield queries.get_inbox_messages(account).iden
            yield queries.get_unread_messages(account).iden
            yield queries.get_inbox_comments(account).iden
            yield queries.get_unread_comments(account).iden
            yield queries.get_inbox_selfreply(account).iden
            yield queries.get_unread_selfreply(account).iden
            yield queries.get_sent(account).iden

        sr_q = Subreddit._query(
            Subreddit.c._spam == (True, False),
            sort=desc('_date'),
        )
        for sr in fetch_things2(sr_q, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')
            yield queries.get_links(sr, 'hot', 'all').iden
            yield queries.get_links(sr, 'new', 'all').iden

            for sort in 'top', 'controversial':
                for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                    yield queries.get_links(sr,
                                            sort,
                                            time,
                                            merge_batched=False).iden
            yield queries.get_spam_links(sr).iden
            yield queries.get_spam_comments(sr).iden
            yield queries.get_reported_links(sr).iden
            yield queries.get_reported_comments(sr).iden
            yield queries.get_subreddit_messages(sr).iden
            yield queries.get_unread_subreddit_messages(sr).iden

    done = 0
    for keys in in_chunks(gen_keys(), verbosity):
        g.reset_caches()
        done += len(keys)
        print 'Done %d: %r' % (done, keys[-1])
        populate(keys)