Example #1
def upgrade_messages(update_comments=True, update_messages=True, update_trees=True):
    from r2.lib.db import queries
    from r2.lib import comment_tree, cache
    from r2.models import Account
    from pylons import app_globals as g

    accounts = set()

    def batch_fn(items):
        g.reset_caches()
        return items

    if update_messages or update_trees:
        q = Message._query(Message.c.new == True, sort=desc("_date"), data=True)
        for m in fetch_things2(q, batch_fn=batch_fn):
            print m, m._date
            if update_messages:
                accounts = accounts | queries.set_unread(m, m.new)
            else:
                accounts.add(m.to_id)
    if update_comments:
        q = Comment._query(Comment.c.new == True, sort=desc("_date"))
        q._filter(Comment.c._id < 26152162676)

        for m in fetch_things2(q, batch_fn=batch_fn):
            print m, m._date
            queries.set_unread(m, True)

    print "Precomputing comment trees for %d accounts" % len(accounts)

    for i, a in enumerate(accounts):
        if not isinstance(a, Account):
            a = Account._byID(a)
        print i, a
        comment_tree.user_messages(a)
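Every example on this page leans on the same pagination idiom: fetch_things2 (imported from r2.lib.utils in several examples below) wraps a Thing query, yields results in fixed-size batches, and re-anchors the query after the last item of each batch so very large tables can be walked without holding everything in memory; an optional batch_fn (as in Example #1, which resets caches) is applied to each batch before its items are yielded. The following is a minimal, self-contained sketch of that pattern, not the actual r2 implementation; FakeQuery and fetch_things2_sketch are illustrative names only.

class FakeQuery(object):
    """Hypothetical stand-in for an r2 Query; only the _limit/_after
    behaviour needed by the sketch is modelled, over a plain list."""
    def __init__(self, items):
        self._items = sorted(items)      # pretend sort=asc('_date')
        self._limit = None
        self._start = 0

    def _after(self, item):
        # continue strictly after the given item, like Query._after()
        self._start = self._items.index(item) + 1

    def __iter__(self):
        stop = self._start + self._limit if self._limit else None
        return iter(self._items[self._start:stop])


def fetch_things2_sketch(query, chunk_size=100, batch_fn=None):
    """Yield every result of `query`, fetching chunk_size rows at a time."""
    query._limit = chunk_size
    items = list(query)
    while items:
        done = len(items) < chunk_size   # a short batch means we hit the end
        after = items[-1]                # anchor for the next batch
        if batch_fn:
            items = batch_fn(items)      # e.g. reset caches between batches
        for item in items:
            yield item
        if done:
            break
        query._after(after)
        items = list(query)


if __name__ == '__main__':
    q = FakeQuery(list(range(10)))
    assert list(fetch_things2_sketch(q, chunk_size=3)) == list(range(10))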
Example #2
    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_reddits()

        yield queries.get_all_comments().iden

        l_q = Link._query(Link.c._spam == (True, False),
                          Link.c._deleted == (True, False),
                          sort=desc('_date'),
                          data=True,
                          )
        for link in fetch_things2(l_q, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        a_q = Account._query(Account.c._spam == (True, False),
                             sort=desc('_date'),
                             )
        for account in fetch_things2(a_q, verbosity):
            yield messages_key(account._id)
            yield last_modified_key(account, 'overview')
            yield last_modified_key(account, 'commented')
            yield last_modified_key(account, 'submitted')
            yield last_modified_key(account, 'liked')
            yield last_modified_key(account, 'disliked')
            yield queries.get_comments(account, 'new', 'all').iden
            yield queries.get_submitted(account, 'new', 'all').iden
            yield queries.get_liked(account).iden
            yield queries.get_disliked(account).iden
            yield queries.get_hidden(account).iden
            yield queries.get_saved(account).iden
            yield queries.get_inbox_messages(account).iden
            yield queries.get_unread_messages(account).iden
            yield queries.get_inbox_comments(account).iden
            yield queries.get_unread_comments(account).iden
            yield queries.get_inbox_selfreply(account).iden
            yield queries.get_unread_selfreply(account).iden
            yield queries.get_sent(account).iden

        sr_q = Subreddit._query(Subreddit.c._spam == (True, False),
                                sort=desc('_date'),
                                )
        for sr in fetch_things2(sr_q, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')
            yield queries.get_links(sr, 'hot', 'all').iden
            yield queries.get_links(sr, 'new', 'all').iden

            for sort in 'top', 'controversial':
                for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                    yield queries.get_links(sr, sort, time,
                                            merge_batched=False).iden
            yield queries.get_spam_links(sr).iden
            yield queries.get_spam_comments(sr).iden
            yield queries.get_reported_links(sr).iden
            yield queries.get_reported_comments(sr).iden
            yield queries.get_subreddit_messages(sr).iden
            yield queries.get_unread_subreddit_messages(sr).iden
Example #3
    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_reddits()

        yield queries.get_all_comments().iden

        l_q = Link._query(Link.c._spam == (True, False),
                          Link.c._deleted == (True, False),
                          sort=desc('_date'),
                          data=True,
                          )
        for link in fetch_things2(l_q, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        a_q = Account._query(Account.c._spam == (True, False),
                             sort=desc('_date'),
                             )
        for account in fetch_things2(a_q, verbosity):
            yield messages_key(account._id)
            yield last_modified_key(account, 'overview')
            yield last_modified_key(account, 'commented')
            yield last_modified_key(account, 'submitted')
            yield last_modified_key(account, 'liked')
            yield last_modified_key(account, 'disliked')
            yield queries.get_comments(account, 'new', 'all').iden
            yield queries.get_submitted(account, 'new', 'all').iden
            yield queries.get_liked(account).iden
            yield queries.get_disliked(account).iden
            yield queries.get_hidden(account).iden
            yield queries.get_saved(account).iden
            yield queries.get_inbox_messages(account).iden
            yield queries.get_unread_messages(account).iden
            yield queries.get_inbox_comments(account).iden
            yield queries.get_unread_comments(account).iden
            yield queries.get_inbox_selfreply(account).iden
            yield queries.get_unread_selfreply(account).iden
            yield queries.get_sent(account).iden

        sr_q = Subreddit._query(Subreddit.c._spam == (True, False),
                                sort=desc('_date'),
                                )
        for sr in fetch_things2(sr_q, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')
            yield queries.get_links(sr, 'hot', 'all').iden
            yield queries.get_links(sr, 'new', 'all').iden

            for sort in 'top', 'controversial':
                for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                    yield queries.get_links(sr, sort, time,
                                            merge_batched=False).iden
            yield queries.get_spam_links(sr).iden
            yield queries.get_spam_comments(sr).iden
            yield queries.get_reported_links(sr).iden
            yield queries.get_reported_comments(sr).iden
            yield queries.get_subreddit_messages(sr).iden
            yield queries.get_unread_subreddit_messages(sr).iden
Example #4
    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_reddits()

        yield queries.get_all_comments().iden

        l_q = Link._query(
            Link.c._spam == (True, False), Link.c._deleted == (True, False), sort=desc("_date"), data=True
        )
        for link in fetch_things2(l_q, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, "comments")

        a_q = Account._query(Account.c._spam == (True, False), sort=desc("_date"))
        for account in fetch_things2(a_q, verbosity):
            yield messages_key(account._id)
            yield last_modified_key(account, "overview")
            yield last_modified_key(account, "commented")
            yield last_modified_key(account, "submitted")
            yield last_modified_key(account, "liked")
            yield last_modified_key(account, "disliked")
            yield queries.get_comments(account, "new", "all").iden
            yield queries.get_submitted(account, "new", "all").iden
            yield queries.get_liked(account).iden
            yield queries.get_disliked(account).iden
            yield queries.get_hidden(account).iden
            yield queries.get_saved(account).iden
            yield queries.get_inbox_messages(account).iden
            yield queries.get_unread_messages(account).iden
            yield queries.get_inbox_comments(account).iden
            yield queries.get_unread_comments(account).iden
            yield queries.get_inbox_selfreply(account).iden
            yield queries.get_unread_selfreply(account).iden
            yield queries.get_sent(account).iden

        sr_q = Subreddit._query(Subreddit.c._spam == (True, False), sort=desc("_date"))
        for sr in fetch_things2(sr_q, verbosity):
            yield last_modified_key(sr, "stylesheet_contents")
            yield queries.get_links(sr, "hot", "all").iden
            yield queries.get_links(sr, "new", "all").iden

            for sort in "top", "controversial":
                for time in "hour", "day", "week", "month", "year", "all":
                    yield queries.get_links(sr, sort, time, merge_batched=False).iden
            yield queries.get_spam_links(sr).iden
            yield queries.get_spam_comments(sr).iden
            yield queries.get_reported_links(sr).iden
            yield queries.get_reported_comments(sr).iden
            yield queries.get_subreddit_messages(sr).iden
            yield queries.get_unread_subreddit_messages(sr).iden
Example #5
def add_all_srs():
    """Adds every listing query for every subreddit to the queue."""
    q = Subreddit._query(sort = asc('_date'))
    for sr in fetch_things2(q):
        add_queries(all_queries(get_links, sr, ('hot', 'new', 'old'), ['all']))
        add_queries(all_queries(get_links, sr, ('top', 'controversial'), db_times.keys()))
        add_queries([get_links(sr, 'toplinks', 'all')])
Example #6
def reset_last_email_sent_at_for_all_accounts():
    start_of_epoc = pytz.utc.localize(datetime.datetime.utcfromtimestamp(0))

    accounts = fetch_things2(Account._query(Account.c.email != None, sort=asc('_date')))
    for account in accounts:
        account.last_email_sent_at = start_of_epoc
        account._commit()
Example #7
def rebuild_index(start_at=None, sleeptime=1, cls=Link, estimate=50000000,
                  chunk_size=1000):
    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(start_at)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             _REBUILD_INDEX_CACHE_KEY)
    
    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        for x in range(5):
            try:
                inject(chunk)
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(_REBUILD_INDEX_CACHE_KEY, last_update._fullname)
        time.sleep(sleeptime)
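A note on the retry loop above, since the same shape recurs in the other rebuild_index/rebuild_link_index examples: the final else belongs to the for statement, not the try, so it only runs when all five attempts raised (the loop was never broken out of), and at that point the last HTTPException is re-raised. Below is a standalone Python 2 sketch of that idiom, with inject_with_retries as an illustrative name rather than anything from r2.

import time
import httplib  # Python 2 stdlib, matching the rest of the listing


def inject_with_retries(inject, chunk, attempts=5):
    for x in range(attempts):
        try:
            inject(chunk)
        except httplib.HTTPException as err:
            time.sleep(x)   # back off a little longer after each failure
            continue
        else:
            break           # success: stop retrying
    else:
        # for/else: reached only if no break occurred, i.e. every attempt
        # failed; Python 2 keeps `err` bound after the except block.
        raise err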
Example #8
def port_cassavotes():
    from r2.models import Vote, Account, Link, Comment
    from r2.models.vote import CassandraVote, CassandraLinkVote, CassandraCommentVote
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, to36, progress

    ts = [(Vote.rel(Account, Link), CassandraLinkVote),
          (Vote.rel(Account, Comment), CassandraCommentVote)]

    dataattrs = set(['valid_user', 'valid_thing', 'ip', 'organic'])

    for prel, crel in ts:
        vq = prel._query(sort=desc('_date'),
                         data=True,
                         eager_load=False)
        vq = fetch_things2(vq)
        vq = progress(vq, persec=True)
        for v in vq:
            t1 = to36(v._thing1_id)
            t2 = to36(v._thing2_id)
            cv = crel(thing1_id = t1,
                      thing2_id = t2,
                      date=v._date,
                      name=v._name)
            for dkey, dval in v._t.iteritems():
                if dkey in dataattrs:
                    setattr(cv, dkey, dval)

            cv._commit(write_consistency_level=CL.ONE)
Example #9
def port_cassavotes():
    from r2.models import Vote, Account, Link, Comment
    from r2.models.vote import CassandraVote, CassandraLinkVote, CassandraCommentVote
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, to36, progress

    ts = [(Vote.rel(Account, Link), CassandraLinkVote),
          (Vote.rel(Account, Comment), CassandraCommentVote)]

    dataattrs = set(['valid_user', 'valid_thing', 'ip', 'organic'])

    for prel, crel in ts:
        vq = prel._query(sort=desc('_date'),
                         data=True,
                         eager_load=False)
        vq = fetch_things2(vq)
        vq = progress(vq, persec=True)
        for v in vq:
            t1 = to36(v._thing1_id)
            t2 = to36(v._thing2_id)
            cv = crel(thing1_id = t1,
                      thing2_id = t2,
                      date=v._date,
                      name=v._name)
            for dkey, dval in v._t.iteritems():
                if dkey in dataattrs:
                    setattr(cv, dkey, dval)

            cv._commit(write_consistency_level=CL.ONE)
Example #10
def process_new_links(period=media_period, force=False):
    """Fetches links from the last period and sets their media
    properties. If force is True, it will fetch properties for links
    even if the properties already exist."""
    links = Link._query(Link.c._date > timeago(period),
                        sort=desc('_date'),
                        data=True)
    results = {}
    jobs = []
    for link in fetch_things2(links):
        if link.is_self or link.promoted:
            continue
        elif not force and (link.has_thumbnail or link.media_object):
            continue

        jobs.append(make_link_info_job(results, link, g.useragent))

    #send links to a queue
    wq = WorkQueue(jobs, num_workers=20, timeout=30)
    wq.start()
    wq.jobs.join()

    #when the queue is finished, do the db writes in this thread
    for link, info in results.items():
        update_link(link, info[0], info[1])
Example #11
def add_all_srs():
    """Adds every listing query for every subreddit to the queue."""
    q = Subreddit._query(sort = asc('_date'))
    for sr in fetch_things2(q):
        add_queries(all_queries(get_links, sr, ('hot', 'new', 'old'), ['all']))
        add_queries(all_queries(get_links, sr, ('top', 'controversial'), db_times.keys()))
        add_queries([get_links(sr, 'toplinks', 'all')])
Example #12
def test_send_summary_emails():
    accounts = fetch_things2(Account._query(Account.c.email != None, sort=asc('_date')))
    for account in accounts:
        a_day_ago = datetime.datetime.now(pytz.utc) - datetime.timedelta(hours=24)
        account.last_email_sent_at = a_day_ago
        account._commit()
        send_account_summary_email(account._id, verbose=True)
Example #13
def convert_old_media_objects():
    q = Link._query(Link.c.media_object != None,
                    Link.c._date > whenever,
                    data = True)
    for link in utils.fetch_things2(q):
        if not getattr(link, 'media_object', None):
            continue

        if 'youtube' in link.media_object:
            # we can rewrite this one without scraping
            video_id = YoutubeScraper.video_id_rx.match(link.url)
            link.media_object = dict(type='youtube.com',
                                     video_id = video_id.group(1))
        elif ('video.google.com' in link.media_object
              or 'metacafe' in link.media_object):
            scraper = make_scraper(link.url)
            if not scraper:
                continue
            mo = scraper.media_object()
            if not mo:
                continue

            link.media_object = mo

        else:
            print "skipping %s because it confuses me" % link._fullname
            continue

        link._commit()
Example #14
def _rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=SolrLinkUploader, estimate=50000000, 
                       chunk_size=1000):
    uploader = uploader()

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        uploader.fullnames = [c._fullname for c in chunk] 
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
Example #15
def populate_spam_filtered():
    from r2.lib.db.queries import get_spam_links, get_spam_comments
    from r2.lib.db.queries import get_spam_filtered_links, get_spam_filtered_comments
    from r2.models.query_cache import CachedQueryMutator

    def was_filtered(thing):
        if thing._spam and not thing._deleted and \
           getattr(thing, 'verdict', None) != 'mod-removed':
            return True
        else:
            return False

    q = Subreddit._query(sort=asc('_date'))
    for sr in fetch_things2(q):
        print 'Processing %s' % sr.name
        links = Thing._by_fullname(get_spam_links(sr),
                                   data=True,
                                   return_dict=False)
        comments = Thing._by_fullname(get_spam_comments(sr),
                                      data=True,
                                      return_dict=False)
        insert_links = [l for l in links if was_filtered(l)]
        insert_comments = [c for c in comments if was_filtered(c)]
        with CachedQueryMutator() as m:
            m.insert(get_spam_filtered_links(sr), insert_links)
            m.insert(get_spam_filtered_comments(sr), insert_comments)
Example #16
def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
Example #17
def backfill(after=None):
    q = Subreddit._query(sort=asc('_date'))
    if after:
        sr = Subreddit._by_name(after)
        q = q._after(sr)

    for sr in fetch_things2(q):
        backfill_sr(sr)
Example #18
def by_url_cache():
    q = Link._query(Link.c._spam == (True,False),
                    data = True,
                    sort = desc('_date'))
    for i, link in enumerate(fetch_things2(q)):
        if i % 100 == 0:
            print "%s..." % i
        link.set_url_cache()
Example #19
def backfill(after=None):
    q = Subreddit._query(sort=asc('_date'))
    if after:
        sr = Subreddit._by_name(after)
        q = q._after(sr)

    for sr in fetch_things2(q):
        backfill_sr(sr)
Example #20
def get_participated():
    users = {}

    q = Account._query(Account.c.f2p != "", sort=asc("_date"), data=True)
    for user in progress(fetch_things2(q)):
        users[user._fullname] = user.f2p

    return users
Example #21
def by_url_cache():
    q = Link._query(Link.c._spam == (True, False),
                    data=True,
                    sort=desc('_date'))
    for i, link in enumerate(fetch_things2(q)):
        if i % 100 == 0:
            print "%s..." % i
        link.set_url_cache()
Example #22
def shorten_byurl_keys():
    """We changed by_url keys from a format like
           byurl_google.com...
       to:
           byurl(1d5920f4b44b27a802bd77c4f0536f5a, google.com...)
       so that they would fit in memcache's 251-char limit
    """

    from datetime import datetime
    from hashlib import md5
    from r2.models import Link
    from r2.lib.filters import _force_utf8
    from pylons import g
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.db.operators import desc
    from r2.lib.utils import base_url, progress

    # from link.py
    def old_by_url_key(url):
        prefix = 'byurl_'
        s = _force_utf8(base_url(url.lower()))
        return '%s%s' % (prefix, s)

    def new_by_url_key(url):
        maxlen = 250
        template = 'byurl(%s,%s)'
        keyurl = _force_utf8(base_url(url.lower()))
        hexdigest = md5(keyurl).hexdigest()
        usable_len = maxlen - len(template) - len(hexdigest)
        return template % (hexdigest, keyurl[:usable_len])

    verbosity = 1000

    l_q = Link._query(Link.c._spam == (True, False),
                      data=True,
                      sort=desc('_date'))
    for links in (in_chunks(
            progress(
                fetch_things2(l_q, verbosity),
                key=lambda link: link._date,
                verbosity=verbosity,
                estimate=int(9.9e6),
                persec=True,
            ), verbosity)):
        # only links with actual URLs
        links = filter(
            lambda link:
            (not getattr(link, 'is_self', False) and getattr(link, 'url', '')),
            links)

        # old key -> new key
        translate = dict((old_by_url_key(link.url), new_by_url_key(link.url))
                         for link in links)

        old = g.permacache.get_multi(translate.keys())
        new = dict((translate[old_key], value)
                   for (old_key, value) in old.iteritems())
        g.permacache.set_multi(new)
Example #23
def send_account_summary_email(account_thing_id, verbose=False, send_email=send_email):
    account = Account._byID(account_thing_id, data=True)
    if not should_send_activity_summary_email(account):
        return

    # if we've never sent an email, only tell about the last 24 hours
    a_day_ago = datetime.datetime.now(pytz.utc) - datetime.timedelta(hours=24)
    if getattr(account, 'last_email_sent_at', None) is None:
        account.last_email_sent_at = a_day_ago

    c.content_langs = 'en-US'

    # Find all the "active" links for this user.  Frontpage uses the c.user global
    # to find the right subreddits for the current user
    c.user = account
    c.user_is_loggedin = True
    thing_ids = []
    for link in Frontpage.get_links('active', 'all'):
        thing_ids.append(link)
    active_links_hash = Link._by_fullname(thing_ids, data=True)

    active_links = [active_links_hash[t_id] for t_id in thing_ids
                    if active_links_hash[t_id]._active > account.last_email_sent_at]
    for idx, ll in enumerate(active_links, start=1):
        ll.num = idx

    # Find all new spaces created since we last sent the user an email
    new_spaces = list(fetch_things2(Subreddit._query(
        Subreddit.c._date > account.last_email_sent_at,
        sort=asc('_date'))))

    # don't bother sending email if there's nothing to report.
    if len(new_spaces) == 0 and len(active_links) == 0:
        return

    # Get the date and time
    now = datetime.datetime.now(pytz.timezone('US/Eastern'))
    date_string = now.strftime("%A %B %d, %Y")
    time_string = now.strftime("%I:%M %p")

    # Render the template
    html_email_template = g.mako_lookup.get_template('summary_email.html')
    html_body = html_email_template.render(
        last_email_sent_at=account.last_email_sent_at,
        new_spaces=new_spaces, 
        active_links=active_links,
        date_string=date_string,
        time_string=time_string)

    # with open('out.html', 'w') as ff:
    #     ff.write(html_body)
    if verbose:
        print "sending email to %s" % (account.email,)
    send_email(account.email, html_body, date_string)

    account.last_email_sent_at = datetime.datetime.now(pytz.utc)
    account._commit()
Example #24
def add_all_ban_report_srs():
    """Adds the initial spam/reported pages to the report queue"""
    q = Subreddit._query(sort = asc('_date'))
    for sr in fetch_things2(q):
        add_queries([get_spam_links(sr),
                     get_spam_comments(sr),
                     get_reported_links(sr),
                     get_reported_comments(sr),
                     ])
Example #25
def add_all_ban_report_srs():
    """Adds the initial spam/reported pages to the report queue"""
    q = Subreddit._query(sort = asc('_date'))
    for sr in fetch_things2(q):
        add_queries([get_spam_links(sr),
                     get_spam_comments(sr),
                     get_reported_links(sr),
                     get_reported_comments(sr),
                     ])
Example #26
def backfill_campaign_targets():
    from r2.lib.db.operators import desc
    from r2.lib.utils import fetch_things2

    q = PromoCampaign._query(sort=desc("_date"), data=True)
    for campaign in fetch_things2(q):
        sr_name = campaign.sr_name or Frontpage.name
        campaign.target = Target(sr_name)
        campaign._commit()
Example #27
def add_allow_top_to_srs():
    "Add the allow_top property to all stored subreddits"
    from r2.models import Subreddit
    from r2.lib.db.operators import desc
    from r2.lib.utils import fetch_things2

    q = Subreddit._query(Subreddit.c._spam == (True,False),
                         sort = desc('_date'))
    for sr in fetch_things2(q):
        sr.allow_top = True
        sr._commit()
Example #28
    def load_accounts(inbox_rel):
        accounts = set()
        q = inbox_rel._query(eager_load=False, data=False, sort=desc("_date"))
        if min_date:
            q._filter(inbox_rel.c._date > min_date)

        for i in fetch_things2(q):
            accounts.add(i._thing1_id)

        return accounts
Example #29
    def load_accounts(inbox_rel):
        accounts = set()
        q = inbox_rel._query(eager_load=False, data=False, sort=desc("_date"))
        if min_date:
            q._filter(inbox_rel.c._date > min_date)

        for i in fetch_things2(q):
            accounts.add(i._thing1_id)

        return accounts
Example #30
def add_allow_top_to_srs():
    "Add the allow_top property to all stored subreddits"
    from r2.models import Subreddit
    from r2.lib.db.operators import desc
    from r2.lib.utils import fetch_things2

    q = Subreddit._query(Subreddit.c._spam == (True,False),
                         sort = desc('_date'))
    for sr in fetch_things2(q):
        sr.allow_top = True
        sr._commit()
Example #31
def port_cassahides():
    from r2.models import SaveHide, CassandraHide
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.db.operators import desc
    from r2.lib.utils import fetch_things2, timeago, progress

    q = SaveHide._query(SaveHide.c._date > timeago("1 week"), SaveHide.c._name == "hide", sort=desc("_date"))
    q = fetch_things2(q)
    q = progress(q, estimate=1953374)

    for sh in q:
        CassandraHide._hide(sh._thing1, sh._thing2, write_consistency_level=CL.ONE)
Example #32
def add_all_srs():
    """Recalculates every listing query for every subreddit. Very,
       very slow."""
    q = Subreddit._query(sort=asc("_date"))
    for sr in fetch_things2(q):
        for q in all_queries(get_links, sr, ("hot", "new"), ["all"]):
            q.update()
        for q in all_queries(get_links, sr, time_filtered_sorts, db_times.keys()):
            q.update()
        get_spam_links(sr).update()
        # get_spam_comments(sr).update()
        get_reported_links(sr).update()
Example #33
    def houses_list(cls):
        # Get all the non-private subreddits that are houses...
        query = cls._query( cls.c.type != 'private',
                            cls.c.space_is_house == True,
                            sort = '_date',
                            data = True )
        houses = []
        for space in fetch_things2(query):
            houses.append((space.name, space.house_rules))

        # Sort and return them
        houses.sort()
        return houses
Example #34
def add_all_srs():
    """Recalculates every listing query for every subsciteit. Very,
       very slow."""
    q = Subsciteit._query(sort = asc('_date'))
    for sr in fetch_things2(q):
        for q in all_queries(get_links, sr, ('hot', 'new'), ['all'],no_children=True):
            q.update()
        for q in all_queries(get_links, sr, time_filtered_sorts, db_times.keys(),no_children=True):
            q.update()
        get_spam_links(sr).update()
        get_spam_comments(sr).update()
        get_reported_links(sr).update()
        get_reported_comments(sr).update()
Example #35
def add_all_srs():
    """Recalculates every listing query for every subreddit. Very,
       very slow."""
    q = Subreddit._query(sort = asc('_date'))
    for sr in fetch_things2(q):
        for q in all_queries(get_links, sr, ('hot', 'new'), ['all']):
            q.update()
        for q in all_queries(get_links, sr, time_filtered_sorts, db_times.keys()):
            q.update()
        get_spam_links(sr).update()
        get_spam_comments(sr).update()
        get_reported_links(sr).update()
        get_reported_comments(sr).update()
Example #36
def convert_promoted():
    """
    should only need to be run once to update old style promoted links
    to the new style.
    """
    from r2.lib.utils import fetch_things2
    from r2.lib import authorize

    q = Link._query(Link.c.promoted == (True, False),
                    sort = desc("_date"))
    sr_id = PromoteSR._id
    bid = 100
    with g.make_lock(promoted_lock_key):
        promoted = {}
        set_promoted({})
        for l in fetch_things2(q):
            print "updating:", l
            try:
                if not l._loaded: l._load()
                # move the promotion into the promo subdigg
                l.sr_id = sr_id
                # set it to accepted (since some of the update functions
                # check that it is not already promoted)
                l.promote_status = STATUS.accepted
                author = Account._byID(l.author_id)
                l.promote_trans_id = authorize.auth_transaction(bid, author, -1, l)
                l.promote_bid = bid
                l.maximum_clicks = None
                l.maximum_views = None
                # set the dates
                start = getattr(l, "promoted_on", l._date)
                until = getattr(l, "promote_until", None) or \
                    (l._date + timedelta(1))
                l.promote_until = None
                update_promo_dates(l, start, until)
                # mark it as promoted if it was promoted when we got there
                if l.promoted and l.promote_until > datetime.now(g.tz):
                    l.promote_status = STATUS.pending
                else:
                    l.promote_status = STATUS.finished
    
                if not hasattr(l, "disable_comments"):
                    l.disable_comments = False
                # add it to the auction list
                if l.promote_status == STATUS.pending and l._fullname not in promoted:
                    promoted[l._fullname] = auction_weight(l)
                l._commit()
            except AttributeError:
                print "BAD THING:", l
        print promoted
        set_promoted(promoted)
Example #37
def upgrade_messages(update_comments=True,
                     update_messages=True,
                     update_trees=True):
    from r2.lib.db import queries
    from r2.lib import comment_tree, cache
    from r2.models import Account
    from pylons import app_globals as g
    accounts = set()

    def batch_fn(items):
        g.reset_caches()
        return items

    if update_messages or update_trees:
        q = Message._query(Message.c.new == True,
                           sort=desc("_date"),
                           data=True)
        for m in fetch_things2(q, batch_fn=batch_fn):
            print m, m._date
            if update_messages:
                accounts = accounts | queries.set_unread(m, m.new)
            else:
                accounts.add(m.to_id)
    if update_comments:
        q = Comment._query(Comment.c.new == True, sort=desc("_date"))
        q._filter(Comment.c._id < 26152162676)

        for m in fetch_things2(q, batch_fn=batch_fn):
            print m, m._date
            queries.set_unread(m, True)

    print "Precomputing comment trees for %d accounts" % len(accounts)

    for i, a in enumerate(accounts):
        if not isinstance(a, Account):
            a = Account._byID(a)
        print i, a
        comment_tree.user_messages(a)
Example #38
def add_all_srs():
    """Adds every listing query for every subreddit to the queue."""
    q = Subreddit._query(sort=asc("_date"))
    for sr in fetch_things2(q):
        add_queries(all_queries(get_links, sr, ("hot", "new"), ["all"]))
        add_queries(all_queries(get_links, sr, ("top", "controversial"), db_times.keys()))
        add_queries(
            [
                get_spam_links(sr),
                # get_spam_comments(sr),
                get_reported_links(sr),
                # get_reported_comments(sr),
            ]
        )
Example #39
def port_cassasaves(after_id=None, estimate=12489897):
    from r2.models import SaveHide, CassandraSave
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, to36, progress

    q = SaveHide._query(SaveHide.c._name == "save", sort=desc("_date"), data=False, eager_load=False)

    if after_id is not None:
        q._after(SaveHide._byID(after_id))

    for sh in progress(fetch_things2(q), estimate=estimate):

        csh = CassandraSave(thing1_id=to36(sh._thing1_id), thing2_id=to36(sh._thing2_id), date=sh._date)
        csh._commit(write_consistency_level=CL.ONE)
Example #40
def load_all_reddits():
    query_cache = {}

    q = Subreddit._query(Subreddit.c.type == 'public',
                         Subreddit.c._downs > 1,
                         sort=(desc('_downs'), desc('_ups')),
                         data=True)
    for sr in utils.fetch_things2(q):
        name = sr.name.lower()
        for i in xrange(len(name)):
            prefix = name[:i + 1]
            names = query_cache.setdefault(prefix, [])
            if len(names) < 10:
                names.append(sr.name)

    g.permacache.set_multi(query_cache, prefix=sr_prefix)
Example #41
def port_deleted_links(after_id=None):
    from r2.models import Link
    from r2.lib.db.operators import desc
    from r2.models.query_cache import CachedQueryMutator
    from r2.lib.db.queries import get_deleted_links
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True, Link.c._spam == (True, False), sort=desc("_date"), data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
Example #42
def load_all_reddits():
    query_cache = {}

    q = Subreddit._query(Subreddit.c.type == 'public',
                         Subreddit.c._downs > 1,
                         sort = (desc('_downs'), desc('_ups')),
                         data = True)
    for sr in utils.fetch_things2(q):
        name = sr.name.lower()
        for i in xrange(len(name)):
            prefix = name[:i + 1]
            names = query_cache.setdefault(prefix, [])
            if len(names) < 10:
                names.append(sr.name)

    g.permacache.set_multi(query_cache, prefix = sr_prefix)
Example #43
def port_cassahides():
    from r2.models import SaveHide, CassandraHide
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.db.operators import desc
    from r2.lib.utils import fetch_things2, timeago, progress

    q = SaveHide._query(SaveHide.c._date > timeago('1 week'),
                        SaveHide.c._name == 'hide',
                        sort=desc('_date'))
    q = fetch_things2(q)
    q = progress(q, estimate=1953374)

    for sh in q:
        CassandraHide._hide(sh._thing1,
                            sh._thing2,
                            write_consistency_level=CL.ONE)
Example #44
def rebuild_link_index(start_at=None,
                       sleeptime=1,
                       cls=Link,
                       uploader=LinkUploader,
                       doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000,
                       chunk_size=1000):
    cache_key = _REBUILD_INDEX_CACHE_KEY % uploader.__name__.lower()
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(cache_key)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             cache_key)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'),
                   data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q,
                         verbosity=1000,
                         estimate=estimate,
                         persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        #uploader.things = chunk
        uploader.fullnames = [link._fullname for link in chunk]
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(cache_key, last_update._fullname)
        time.sleep(sleeptime)
Example #45
def cache_lists():
    def _chop(srs):
        srs.sort(key=lambda s: s._downs, reverse=True)
        return srs[:limit]

    # bylang    =:= dict((lang, over18_state) -> [Subreddit])
    # lang      =:= all | lang()
    # nsfwstate =:= no_over18 | allow_over18 | only_over18
    bylang = {}

    for sr in fetch_things2(Subreddit._query(sort=desc('_date'),
                                             data=True)):
        aid = getattr(sr, 'author_id', None)
        if aid is not None and aid < 0:
            # skip special system reddits like promos
            continue

        if sr.type not in ('public', 'restricted'):
            # skips reddits that can't appear in the default list
            # because of permissions
            continue

        g.log.debug(sr.name)
        for lang in 'all', sr.lang:
            over18s = ['allow_over18']
            if sr.over_18:
                over18s.append('only_over18')
            else:
                over18s.append('no_over18')

            for over18 in over18s:
                k = (lang, over18)
                bylang.setdefault(k, []).append(sr)

                # keep the lists small while we work
                if len(bylang[k]) > limit*2:
                    g.log.debug('Shrinking %s' % (k,))
                    bylang[k] = _chop(bylang[k])

    for (lang, over18), srs in bylang.iteritems():
        srs = _chop(srs)
        sr_tuples = map(lambda sr: (sr._downs, sr.allow_top, sr._id), srs)

        g.log.debug("For %s/%s setting %s" % (lang, over18,
                                              map(lambda sr: sr.name, srs)))

        g.permacache.set(cached_srs_key(lang, over18), sr_tuples)
Example #46
def load_all_reddits():
    query_cache = {}

    q = Subreddit._query(Subreddit.c.type == 'public',
                         Subreddit.c._downs > 1,
                         sort = (desc('_downs'), desc('_ups')),
                         data = True)
    for sr in utils.fetch_things2(q):
        name = sr.name.lower()
        for i in xrange(len(name)):
            prefix = name[:i + 1]
            names = query_cache.setdefault(prefix, [])
            if len(names) < 10:
                names.append((sr.name, sr.over_18))

    for name_prefix, subreddits in query_cache.iteritems():
        SubredditsByPartialName._set_values(name_prefix, {'tups': subreddits})
Example #47
def load_all_reddits():
    query_cache = {}

    q = Subreddit._query(Subreddit.c.type == 'public',
                         Subreddit.c._downs > 1,
                         sort=(desc('_downs'), desc('_ups')),
                         data=True)
    for sr in utils.fetch_things2(q):
        name = sr.name.lower()
        for i in xrange(len(name)):
            prefix = name[:i + 1]
            names = query_cache.setdefault(prefix, [])
            if len(names) < 10:
                names.append(sr.name)

    for name_prefix, subreddits in query_cache.iteritems():
        SubredditsByPartialName._set_values(name_prefix, {'srs': subreddits})
Example #48
def cache_lists():
    def _chop(srs):
        srs.sort(key=lambda s: s._downs, reverse=True)
        return srs[:limit]

    # bylang    =:= dict((lang, over18_state) -> [Subreddit])
    # lang      =:= all | lang()
    # nsfwstate =:= no_over18 | allow_over18 | only_over18
    bylang = {}

    for sr in fetch_things2(Subreddit._query(sort=desc('_date'),
                                             data=True)):
        aid = getattr(sr, 'author_id', None)
        if aid is not None and aid < 0:
            # skip special system reddits like promos
            continue

        type = getattr(sr, 'type', 'private')
        if type not in ('public', 'restricted'):
            # skips reddits that can't appear in the default list
            # because of permissions
            continue

        for lang in 'all', sr.lang:
            over18s = ['allow_over18']
            if sr.over_18:
                over18s.append('only_over18')
            else:
                over18s.append('no_over18')

            for over18 in over18s:
                k = (lang, over18)
                bylang.setdefault(k, []).append(sr)

                # keep the lists small while we work
                if len(bylang[k]) > limit*2:
                    bylang[k] = _chop(bylang[k])

    for (lang, over18), srs in bylang.iteritems():
        srs = _chop(srs)
        sr_tuples = map(lambda sr: (sr._downs, sr.allow_top, sr._id), srs)

        print "For %s/%s setting %s" % (lang, over18,
                                        map(lambda sr: sr.name, srs[:50]))

        SubredditPopularityByLanguage._set_values(lang, {over18: sr_tuples})
Example #49
def queue_summary_emails():
    start = datetime.datetime.now()
    # find all accounts that should get an email

    # this implementation is slow, as it iterates over all accounts that have an email
    # address.  One idea to make it faster is to turn the "last_email_sent_at" data 
    # attribute into an actual sql column you can query

    accounts = fetch_things2(Account._query(Account.c.email != None, sort=asc('_date')))
    for account in accounts:
        if should_send_activity_summary_email(account):
            # using _add_item over add_item as that skips using a daemon thread to talk
            # to the amqp server that might not finish its job before the process exits
            amqp._add_item('summary_email_q', str(account._id))
            print "Queued summary email for %r" % (account.email,)
    end = datetime.datetime.now()
    print "Time to scan accounts to queue emails: %s" % (end - start)
Example #50
def add_byurl_prefix():
    """Run once before the byurl prefix is set, and once after (killing
       it after it gets back to where it started the first time)."""

    from datetime import datetime
    from r2.models import Link
    from r2.lib.filters import _force_utf8
    from pylons import g
    from r2.lib.utils import fetch_things2
    from r2.lib.db.operators import desc
    from r2.lib.utils import base_url

    now = datetime.now(g.tz)
    print 'started at %s' % (now,)

    l_q = Link._query(
        Link.c._date < now,
        data=True,
        sort=desc('_date'))

    # from link.py
    def by_url_key(url, prefix=''):
        s = _force_utf8(base_url(url.lower()))
        return '%s%s' % (prefix, s)

    done = 0
    for links in fetch_things2(l_q, 1000, chunks=True):
        done += len(links)
        print 'Doing: %r, %s..%s' % (done, links[-1]._date, links[0]._date)

        # only links with actual URLs
        links = filter(lambda link: (not getattr(link, 'is_self', False)
                                     and getattr(link, 'url', '')),
                       links)

        # old key -> new key
        translate = dict((by_url_key(link.url),
                          by_url_key(link.url, prefix='byurl_'))
                         for link in links)

        old = g.permacache.get_multi(translate.keys())
        new = dict((translate[old_key], value)
                   for (old_key, value)
                   in old.iteritems())
        g.permacache.set_multi(new)
Example #51
def backfill_deleted_accounts(resume_id=None):
    del_accts = Account._query(Account.c._deleted == True, sort=desc('_date'))
    if resume_id:
        del_accts._filter(Account.c._id < resume_id)

    for i, account in enumerate(progress(fetch_things2(del_accts))):
        # Don't kill the rabbit! Wait for the relevant queues to calm down.
        if i % 1000 == 0:
            del_len = get_queue_length('del_account_q')
            cs_len = get_queue_length('cloudsearch_changes')
            while (del_len > 1000 or cs_len > 10000):
                sys.stderr.write(("CS: %d, DEL: %d" % (cs_len, del_len)) +
                                 "\n")
                sys.stderr.flush()
                time.sleep(1)
                del_len = get_queue_length('del_account_q')
                cs_len = get_queue_length('cloudsearch_changes')
        amqp.add_item('account_deleted', account._fullname)
Example #52
def port_deleted_links(after_id=None):
    from r2.models import Link
    from r2.lib.db.operators import desc
    from r2.models.query_cache import CachedQueryMutator
    from r2.lib.db.queries import get_deleted_links
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
Example #53
def port_cassasaves(after_id=None, estimate=12489897):
    from r2.models import SaveHide, CassandraSave
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, to36, progress

    q = SaveHide._query(SaveHide.c._name == 'save',
                        sort=desc('_date'),
                        data=False,
                        eager_load=False)

    if after_id is not None:
        q._after(SaveHide._byID(after_id))

    for sh in progress(fetch_things2(q), estimate=estimate):

        csh = CassandraSave(thing1_id=to36(sh._thing1_id),
                            thing2_id=to36(sh._thing2_id),
                            date=sh._date)
        csh._commit(write_consistency_level=CL.ONE)
Example #54
def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrlAndSubreddit
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'),
                    data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubreddit.add_link(l)
Example #55
def _populate(after_id=None, estimate=54301242):
    from r2.models import desc
    from r2.lib.db import tdb_cassandra
    from r2.lib import utils

    # a larger chunk size has a chance to decrease the number of
    # Cassandra writes, but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc('_date'))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, 'link_id'), chunk)
        update_comment_votes(chunk)
Example #56
def rebuild_index(start_at=None,
                  sleeptime=1,
                  cls=Link,
                  estimate=50000000,
                  chunk_size=1000):
    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(start_at)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             _REBUILD_INDEX_CACHE_KEY)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'),
                   data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q,
                         verbosity=1000,
                         estimate=estimate,
                         persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        for x in range(5):
            try:
                inject(chunk)
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(_REBUILD_INDEX_CACHE_KEY, last_update._fullname)
        time.sleep(sleeptime)
Example #57
def catch_up_batch_queries():
    # Catch up on batched_time_times queries that haven't been run but
    # should be. This should be cronned to run about once an hour: the
    # more often it runs, the more the work of rerunning the actual
    # queries is spread out, but every run has a fixed cost of looking
    # at every single subreddit.
    sr_q = Subreddit._query(sort=desc('_downs'), data=True)
    dayago = utils.timeago('1 day')
    for sr in fetch_things2(sr_q):
        if hasattr(sr, 'last_valid_vote') and sr.last_valid_vote > dayago:
            # if we don't know when the last vote was, it couldn't
            # have been today
            for sort in batched_time_sorts:
                for time in batched_time_times:
                    q = make_batched_time_query(sr, sort, time)
                    if q.preflight_check():
                        # we haven't run the batched_time_times in the
                        # last day
                        add_queries([q])

    # make sure that all of the jobs have been completed or processed
    # by the time we return
    worker.join()
Example #58
def get_srmembers(after_user_id):
    previous_user_id = None

    while True:
        # there isn't a good index on rel_id so we need to get a new query
        # for each batch rather than relying solely on fetch_things2
        q = get_query(after_user_id)
        users_seen = 0

        for rel in fetch_things2(q):
            user_id = rel._thing2_id

            if user_id != previous_user_id:
                if users_seen >= 20:
                    # set after_user_id to the previous id so we will pick up
                    # the query at this same point
                    after_user_id = previous_user_id
                    break

                users_seen += 1
                previous_user_id = user_id

            yield rel