def upgrade_messages(update_comments=True, update_messages=True,
                     update_trees=True):
    from r2.lib.db import queries
    from r2.lib import comment_tree, cache
    from r2.models import Account
    from pylons import app_globals as g

    accounts = set()

    def batch_fn(items):
        g.reset_caches()
        return items

    if update_messages or update_trees:
        q = Message._query(Message.c.new == True, sort=desc("_date"),
                           data=True)
        for m in fetch_things2(q, batch_fn=batch_fn):
            print m, m._date
            if update_messages:
                accounts = accounts | queries.set_unread(m, m.new)
            else:
                accounts.add(m.to_id)

    if update_comments:
        q = Comment._query(Comment.c.new == True, sort=desc("_date"))
        q._filter(Comment.c._id < 26152162676)
        for m in fetch_things2(q, batch_fn=batch_fn):
            print m, m._date
            queries.set_unread(m, True)

    print "Precomputing comment trees for %d accounts" % len(accounts)

    for i, a in enumerate(accounts):
        if not isinstance(a, Account):
            a = Account._byID(a)
        print i, a
        comment_tree.user_messages(a)
def gen_keys():
    yield promoted_memo_key

    # just let this one do its own writing
    load_all_reddits()

    yield queries.get_all_comments().iden

    l_q = Link._query(Link.c._spam == (True, False),
                      Link.c._deleted == (True, False),
                      sort=desc('_date'),
                      data=True,
                      )
    for link in fetch_things2(l_q, verbosity):
        yield comments_key(link._id)
        yield last_modified_key(link, 'comments')

    a_q = Account._query(Account.c._spam == (True, False),
                         sort=desc('_date'),
                         )
    for account in fetch_things2(a_q, verbosity):
        yield messages_key(account._id)
        yield last_modified_key(account, 'overview')
        yield last_modified_key(account, 'commented')
        yield last_modified_key(account, 'submitted')
        yield last_modified_key(account, 'liked')
        yield last_modified_key(account, 'disliked')
        yield queries.get_comments(account, 'new', 'all').iden
        yield queries.get_submitted(account, 'new', 'all').iden
        yield queries.get_liked(account).iden
        yield queries.get_disliked(account).iden
        yield queries.get_hidden(account).iden
        yield queries.get_saved(account).iden
        yield queries.get_inbox_messages(account).iden
        yield queries.get_unread_messages(account).iden
        yield queries.get_inbox_comments(account).iden
        yield queries.get_unread_comments(account).iden
        yield queries.get_inbox_selfreply(account).iden
        yield queries.get_unread_selfreply(account).iden
        yield queries.get_sent(account).iden

    sr_q = Subreddit._query(Subreddit.c._spam == (True, False),
                            sort=desc('_date'),
                            )
    for sr in fetch_things2(sr_q, verbosity):
        yield last_modified_key(sr, 'stylesheet_contents')
        yield queries.get_links(sr, 'hot', 'all').iden
        yield queries.get_links(sr, 'new', 'all').iden

        for sort in 'top', 'controversial':
            for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                yield queries.get_links(sr, sort, time,
                                        merge_batched=False).iden
        yield queries.get_spam_links(sr).iden
        yield queries.get_spam_comments(sr).iden
        yield queries.get_reported_links(sr).iden
        yield queries.get_reported_comments(sr).iden
        yield queries.get_subreddit_messages(sr).iden
        yield queries.get_unread_subreddit_messages(sr).iden
def add_all_srs():
    """Adds every listing query for every subreddit to the queue."""
    q = Subreddit._query(sort=asc('_date'))
    for sr in fetch_things2(q):
        add_queries(all_queries(get_links, sr, ('hot', 'new', 'old'), ['all']))
        add_queries(all_queries(get_links, sr, ('top', 'controversial'),
                                db_times.keys()))
        add_queries([get_links(sr, 'toplinks', 'all')])
def reset_last_email_sent_at_for_all_accounts():
    start_of_epoc = pytz.utc.localize(datetime.datetime.utcfromtimestamp(0))
    accounts = fetch_things2(Account._query(Account.c.email != None,
                                            sort=asc('_date')))
    for account in accounts:
        account.last_email_sent_at = start_of_epoc
        account._commit()
def rebuild_index(start_at=None, sleeptime=1, cls=Link,
                  estimate=50000000, chunk_size=1000):
    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(start_at)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             _REBUILD_INDEX_CACHE_KEY)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        for x in range(5):
            try:
                inject(chunk)
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(_REBUILD_INDEX_CACHE_KEY, last_update._fullname)
        time.sleep(sleeptime)
def port_cassavotes():
    from r2.models import Vote, Account, Link, Comment
    from r2.models.vote import CassandraVote, CassandraLinkVote, \
        CassandraCommentVote
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, to36, progress

    ts = [(Vote.rel(Account, Link), CassandraLinkVote),
          (Vote.rel(Account, Comment), CassandraCommentVote)]

    dataattrs = set(['valid_user', 'valid_thing', 'ip', 'organic'])

    for prel, crel in ts:
        vq = prel._query(sort=desc('_date'),
                         data=True,
                         eager_load=False)
        vq = fetch_things2(vq)
        vq = progress(vq, persec=True)
        for v in vq:
            t1 = to36(v._thing1_id)
            t2 = to36(v._thing2_id)
            cv = crel(thing1_id=t1,
                      thing2_id=t2,
                      date=v._date,
                      name=v._name)
            for dkey, dval in v._t.iteritems():
                if dkey in dataattrs:
                    setattr(cv, dkey, dval)

            cv._commit(write_consistency_level=CL.ONE)
def process_new_links(period=media_period, force=False):
    """Fetches links from the last period and sets their media
    properties. If force is True, it will fetch properties for links
    even if the properties already exist."""
    links = Link._query(Link.c._date > timeago(period), sort=desc('_date'),
                        data=True)
    results = {}
    jobs = []
    for link in fetch_things2(links):
        if link.is_self or link.promoted:
            continue
        elif not force and (link.has_thumbnail or link.media_object):
            continue

        jobs.append(make_link_info_job(results, link, g.useragent))

    # send links to a queue
    wq = WorkQueue(jobs, num_workers=20, timeout=30)
    wq.start()
    wq.jobs.join()

    # when the queue is finished, do the db writes in this thread
    for link, info in results.items():
        update_link(link, info[0], info[1])
def test_send_summary_emails():
    accounts = fetch_things2(Account._query(Account.c.email != None,
                                            sort=asc('_date')))
    for account in accounts:
        a_day_ago = datetime.datetime.now(pytz.utc) - datetime.timedelta(hours=24)
        account.last_email_sent_at = a_day_ago
        account._commit()
        send_account_summary_email(account._id, verbose=True)
def convert_old_media_objects():
    q = Link._query(Link.c.media_object is not None,
                    Link.c._date > whenever,
                    data=True)
    for link in utils.fetch_things2(q):
        if not getattr(link, 'media_object', None):
            continue

        if 'youtube' in link.media_object:
            # we can rewrite this one without scraping
            video_id = YoutubeScraper.video_id_rx.match(link.url)
            link.media_object = dict(type='youtube.com',
                                     video_id=video_id.group(1))
        elif ('video.google.com' in link.media_object
              or 'metacafe' in link.media_object):
            scraper = make_scraper(link.url)
            if not scraper:
                continue
            mo = scraper.media_object()
            if not mo:
                continue

            link.media_object = mo
        else:
            print "skipping %s because it confuses me" % link._fullname
            continue

        link._commit()
def _rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                        uploader=SolrLinkUploader, estimate=50000000,
                        chunk_size=1000):
    uploader = uploader()

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        uploader.fullnames = [c._fullname for c in chunk]
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
def populate_spam_filtered():
    from r2.lib.db.queries import get_spam_links, get_spam_comments
    from r2.lib.db.queries import get_spam_filtered_links, get_spam_filtered_comments
    from r2.models.query_cache import CachedQueryMutator

    def was_filtered(thing):
        if thing._spam and not thing._deleted and \
           getattr(thing, 'verdict', None) != 'mod-removed':
            return True
        else:
            return False

    q = Subreddit._query(sort=asc('_date'))
    for sr in fetch_things2(q):
        print 'Processing %s' % sr.name
        links = Thing._by_fullname(get_spam_links(sr), data=True,
                                   return_dict=False)
        comments = Thing._by_fullname(get_spam_comments(sr), data=True,
                                      return_dict=False)

        insert_links = [l for l in links if was_filtered(l)]
        insert_comments = [c for c in comments if was_filtered(c)]
        with CachedQueryMutator() as m:
            m.insert(get_spam_filtered_links(sr), insert_links)
            m.insert(get_spam_filtered_comments(sr), insert_comments)
def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
def backfill(after=None):
    q = Subreddit._query(sort=asc('_date'))
    if after:
        sr = Subreddit._by_name(after)
        q = q._after(sr)

    for sr in fetch_things2(q):
        backfill_sr(sr)
def by_url_cache():
    q = Link._query(Link.c._spam == (True, False),
                    data=True,
                    sort=desc('_date'))
    for i, link in enumerate(fetch_things2(q)):
        if i % 100 == 0:
            print "%s..." % i
        link.set_url_cache()
def get_participated():
    users = {}

    q = Account._query(Account.c.f2p != "", sort=asc("_date"), data=True)
    for user in progress(fetch_things2(q)):
        users[user._fullname] = user.f2p

    return users
def shorten_byurl_keys():
    """We changed by_url keys from a format like
           byurl_google.com...
       to:
           byurl(1d5920f4b44b27a802bd77c4f0536f5a, google.com...)
       so that they would fit in memcache's 251-char limit
    """
    from datetime import datetime
    from hashlib import md5

    from r2.models import Link
    from r2.lib.filters import _force_utf8
    from pylons import g
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.db.operators import desc
    from r2.lib.utils import base_url, progress

    # from link.py
    def old_by_url_key(url):
        prefix = 'byurl_'
        s = _force_utf8(base_url(url.lower()))
        return '%s%s' % (prefix, s)

    def new_by_url_key(url):
        maxlen = 250
        template = 'byurl(%s,%s)'
        keyurl = _force_utf8(base_url(url.lower()))
        hexdigest = md5(keyurl).hexdigest()
        usable_len = maxlen - len(template) - len(hexdigest)
        return template % (hexdigest, keyurl[:usable_len])

    verbosity = 1000

    l_q = Link._query(
        Link.c._spam == (True, False),
        data=True,
        sort=desc('_date'))
    for links in in_chunks(
            progress(
                fetch_things2(l_q, verbosity),
                key=lambda link: link._date,
                verbosity=verbosity,
                estimate=int(9.9e6),
                persec=True,
            ),
            verbosity):
        # only links with actual URLs
        links = filter(lambda link: (not getattr(link, 'is_self', False)
                                     and getattr(link, 'url', '')),
                       links)

        # old key -> new key
        translate = dict((old_by_url_key(link.url),
                          new_by_url_key(link.url))
                         for link in links)

        old = g.permacache.get_multi(translate.keys())
        new = dict((translate[old_key], value)
                   for (old_key, value) in old.iteritems())
        g.permacache.set_multi(new)
def send_account_summary_email(account_thing_id, verbose=False,
                               send_email=send_email):
    account = Account._byID(account_thing_id, data=True)
    if not should_send_activity_summary_email(account):
        return

    # if we've never sent an email, only tell about the last 24 hours
    a_day_ago = datetime.datetime.now(pytz.utc) - datetime.timedelta(hours=24)
    if getattr(account, 'last_email_sent_at', None) is None:
        account.last_email_sent_at = a_day_ago

    c.content_langs = 'en-US'

    # Find all the "active" links for this user. Frontpage uses the c.user
    # global to find the right subreddits for the current user
    c.user = account
    c.user_is_loggedin = True
    thing_ids = []
    for link in Frontpage.get_links('active', 'all'):
        thing_ids.append(link)
    active_links_hash = Link._by_fullname(thing_ids, data=True)

    active_links = [active_links_hash[t_id] for t_id in thing_ids
                    if active_links_hash[t_id]._active > account.last_email_sent_at]
    idx = 0
    for ll in active_links:
        idx += 1
        ll.num = idx

    # Find all new spaces created since we last sent the user an email
    new_spaces = list(fetch_things2(Subreddit._query(
        Subreddit.c._date > account.last_email_sent_at,
        sort=asc('_date'))))

    # don't bother sending email if there's nothing to report.
    if len(new_spaces) == 0 and len(active_links) == 0:
        return

    # Get the date and time
    now = datetime.datetime.now(pytz.timezone('US/Eastern'))
    date_string = now.strftime("%A %B %d, %Y")
    time_string = now.strftime("%I:%M %p")

    # Render the template
    html_email_template = g.mako_lookup.get_template('summary_email.html')
    html_body = html_email_template.render(
        last_email_sent_at=account.last_email_sent_at,
        new_spaces=new_spaces,
        active_links=active_links,
        date_string=date_string,
        time_string=time_string)

    # with open('out.html', 'w') as ff:
    #     ff.write(html_body)

    if verbose:
        print "sending email to %s" % (account.email,)
    send_email(account.email, html_body, date_string)

    account.last_email_sent_at = datetime.datetime.now(pytz.utc)
    account._commit()
def add_all_ban_report_srs():
    """Adds the initial spam/reported pages to the report queue"""
    q = Subreddit._query(sort=asc('_date'))
    for sr in fetch_things2(q):
        add_queries([get_spam_links(sr),
                     get_spam_comments(sr),
                     get_reported_links(sr),
                     get_reported_comments(sr),
                     ])
def backfill_campaign_targets():
    from r2.lib.db.operators import desc
    from r2.lib.utils import fetch_things2

    q = PromoCampaign._query(sort=desc("_date"), data=True)
    for campaign in fetch_things2(q):
        sr_name = campaign.sr_name or Frontpage.name
        campaign.target = Target(sr_name)
        campaign._commit()
def add_allow_top_to_srs():
    "Add the allow_top property to all stored subreddits"
    from r2.models import Subreddit
    from r2.lib.db.operators import desc
    from r2.lib.utils import fetch_things2

    q = Subreddit._query(Subreddit.c._spam == (True, False),
                         sort=desc('_date'))
    for sr in fetch_things2(q):
        sr.allow_top = True
        sr._commit()
def load_accounts(inbox_rel):
    accounts = set()
    q = inbox_rel._query(eager_load=False, data=False,
                         sort=desc("_date"))
    if min_date:
        q._filter(inbox_rel.c._date > min_date)

    for i in fetch_things2(q):
        accounts.add(i._thing1_id)

    return accounts
def port_cassahides():
    from r2.models import SaveHide, CassandraHide
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.db.operators import desc
    from r2.lib.utils import fetch_things2, timeago, progress

    q = SaveHide._query(SaveHide.c._date > timeago("1 week"),
                        SaveHide.c._name == "hide",
                        sort=desc("_date"))
    q = fetch_things2(q)
    q = progress(q, estimate=1953374)

    for sh in q:
        CassandraHide._hide(sh._thing1, sh._thing2,
                            write_consistency_level=CL.ONE)
def add_all_srs():
    """Recalculates every listing query for every subreddit. Very, very
    slow."""
    q = Subreddit._query(sort=asc("_date"))
    for sr in fetch_things2(q):
        for q in all_queries(get_links, sr, ("hot", "new"), ["all"]):
            q.update()
        for q in all_queries(get_links, sr, time_filtered_sorts,
                             db_times.keys()):
            q.update()
        get_spam_links(sr).update()
        # get_spam_comments(sr).update()
        get_reported_links(sr).update()
def houses_list(cls):
    # Get all the non-private subreddits that are houses...
    query = cls._query(
        cls.c.type != 'private',
        cls.c.space_is_house == True,
        sort='_date',
        data=True
    )

    houses = []
    for space in fetch_things2(query):
        houses.append((space.name, space.house_rules))

    # Sort and return them
    houses.sort()
    return houses
def add_all_srs():
    """Recalculates every listing query for every subsciteit. Very, very
    slow."""
    q = Subsciteit._query(sort=asc('_date'))
    for sr in fetch_things2(q):
        for q in all_queries(get_links, sr, ('hot', 'new'), ['all'],
                             no_children=True):
            q.update()
        for q in all_queries(get_links, sr, time_filtered_sorts,
                             db_times.keys(), no_children=True):
            q.update()
        get_spam_links(sr).update()
        get_spam_comments(sr).update()
        get_reported_links(sr).update()
        get_reported_comments(sr).update()
def add_all_srs():
    """Recalculates every listing query for every subreddit. Very, very
    slow."""
    q = Subreddit._query(sort=asc('_date'))
    for sr in fetch_things2(q):
        for q in all_queries(get_links, sr, ('hot', 'new'), ['all']):
            q.update()
        for q in all_queries(get_links, sr, time_filtered_sorts,
                             db_times.keys()):
            q.update()
        get_spam_links(sr).update()
        get_spam_comments(sr).update()
        get_reported_links(sr).update()
        get_reported_comments(sr).update()
def convert_promoted():
    """
    should only need to be run once to update old style promoted links
    to the new style.
    """
    from r2.lib.utils import fetch_things2
    from r2.lib import authorize

    q = Link._query(Link.c.promoted == (True, False),
                    sort=desc("_date"))
    sr_id = PromoteSR._id
    bid = 100
    with g.make_lock(promoted_lock_key):
        promoted = {}
        set_promoted({})
        for l in fetch_things2(q):
            print "updating:", l
            try:
                if not l._loaded:
                    l._load()
                # move the promotion into the promo subdigg
                l.sr_id = sr_id
                # set it to accepted (since some of the update functions
                # check that it is not already promoted)
                l.promote_status = STATUS.accepted
                author = Account._byID(l.author_id)
                l.promote_trans_id = authorize.auth_transaction(bid, author,
                                                                -1, l)
                l.promote_bid = bid
                l.maximum_clicks = None
                l.maximum_views = None
                # set the dates
                start = getattr(l, "promoted_on", l._date)
                until = getattr(l, "promote_until", None) or \
                    (l._date + timedelta(1))
                l.promote_until = None
                update_promo_dates(l, start, until)
                # mark it as promoted if it was promoted when we got there
                if l.promoted and l.promote_until > datetime.now(g.tz):
                    l.promote_status = STATUS.pending
                else:
                    l.promote_status = STATUS.finished
                if not hasattr(l, "disable_comments"):
                    l.disable_comments = False
                # add it to the auction list
                if l.promote_status == STATUS.pending and \
                   l._fullname not in promoted:
                    promoted[l._fullname] = auction_weight(l)
                l._commit()
            except AttributeError:
                print "BAD THING:", l
        print promoted
        set_promoted(promoted)
def add_all_srs():
    """Adds every listing query for every subreddit to the queue."""
    q = Subreddit._query(sort=asc("_date"))
    for sr in fetch_things2(q):
        add_queries(all_queries(get_links, sr, ("hot", "new"), ["all"]))
        add_queries(all_queries(get_links, sr, ("top", "controversial"),
                                db_times.keys()))
        add_queries([
            get_spam_links(sr),
            # get_spam_comments(sr),
            get_reported_links(sr),
            # get_reported_comments(sr),
        ])
def port_cassasaves(after_id=None, estimate=12489897):
    from r2.models import SaveHide, CassandraSave
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, to36, progress

    q = SaveHide._query(SaveHide.c._name == "save",
                        sort=desc("_date"),
                        data=False,
                        eager_load=False)

    if after_id is not None:
        q._after(SaveHide._byID(after_id))

    for sh in progress(fetch_things2(q), estimate=estimate):
        csh = CassandraSave(thing1_id=to36(sh._thing1_id),
                            thing2_id=to36(sh._thing2_id),
                            date=sh._date)
        csh._commit(write_consistency_level=CL.ONE)
def load_all_reddits():
    query_cache = {}

    q = Subreddit._query(Subreddit.c.type == 'public',
                         Subreddit.c._downs > 1,
                         sort=(desc('_downs'), desc('_ups')),
                         data=True)
    for sr in utils.fetch_things2(q):
        name = sr.name.lower()
        for i in xrange(len(name)):
            prefix = name[:i + 1]
            names = query_cache.setdefault(prefix, [])
            if len(names) < 10:
                names.append(sr.name)

    g.permacache.set_multi(query_cache, prefix=sr_prefix)
def port_deleted_links(after_id=None):
    from r2.models import Link
    from r2.lib.db.operators import desc
    from r2.models.query_cache import CachedQueryMutator
    from r2.lib.db.queries import get_deleted_links
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc("_date"),
                    data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    cache_key = _REBUILD_INDEX_CACHE_KEY % uploader.__name__.lower()
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(cache_key)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             cache_key)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        #uploader.things = chunk
        uploader.fullnames = [link._fullname for link in chunk]
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(cache_key, last_update._fullname)
        time.sleep(sleeptime)
def cache_lists():
    def _chop(srs):
        srs.sort(key=lambda s: s._downs, reverse=True)
        return srs[:limit]

    # bylang    =:= dict((lang, over18_state) -> [Subreddit])
    # lang      =:= all | lang()
    # nsfwstate =:= no_over18 | allow_over18 | only_over18
    bylang = {}

    for sr in fetch_things2(Subreddit._query(sort=desc('_date'),
                                             data=True)):
        aid = getattr(sr, 'author_id', None)
        if aid is not None and aid < 0:
            # skip special system reddits like promos
            continue

        if sr.type not in ('public', 'restricted'):
            # skips reddits that can't appear in the default list
            # because of permissions
            continue

        g.log.debug(sr.name)

        for lang in 'all', sr.lang:
            over18s = ['allow_over18']
            if sr.over_18:
                over18s.append('only_over18')
            else:
                over18s.append('no_over18')

            for over18 in over18s:
                k = (lang, over18)
                bylang.setdefault(k, []).append(sr)

                # keep the lists small while we work
                if len(bylang[k]) > limit*2:
                    g.log.debug('Shrinking %s' % (k,))
                    bylang[k] = _chop(bylang[k])

    for (lang, over18), srs in bylang.iteritems():
        srs = _chop(srs)
        sr_tuples = map(lambda sr: (sr._downs, sr.allow_top, sr._id), srs)

        g.log.debug("For %s/%s setting %s" % (lang, over18,
                                              map(lambda sr: sr.name, srs)))
        g.permacache.set(cached_srs_key(lang, over18), sr_tuples)
def load_all_reddits():
    query_cache = {}

    q = Subreddit._query(Subreddit.c.type == 'public',
                         Subreddit.c._downs > 1,
                         sort=(desc('_downs'), desc('_ups')),
                         data=True)
    for sr in utils.fetch_things2(q):
        name = sr.name.lower()
        for i in xrange(len(name)):
            prefix = name[:i + 1]
            names = query_cache.setdefault(prefix, [])
            if len(names) < 10:
                names.append((sr.name, sr.over_18))

    for name_prefix, subreddits in query_cache.iteritems():
        SubredditsByPartialName._set_values(name_prefix, {'tups': subreddits})
def load_all_reddits():
    query_cache = {}

    q = Subreddit._query(Subreddit.c.type == 'public',
                         Subreddit.c._downs > 1,
                         sort=(desc('_downs'), desc('_ups')),
                         data=True)
    for sr in utils.fetch_things2(q):
        name = sr.name.lower()
        for i in xrange(len(name)):
            prefix = name[:i + 1]
            names = query_cache.setdefault(prefix, [])
            if len(names) < 10:
                names.append(sr.name)

    for name_prefix, subreddits in query_cache.iteritems():
        SubredditsByPartialName._set_values(name_prefix, {'srs': subreddits})
def cache_lists():
    def _chop(srs):
        srs.sort(key=lambda s: s._downs, reverse=True)
        return srs[:limit]

    # bylang    =:= dict((lang, over18_state) -> [Subreddit])
    # lang      =:= all | lang()
    # nsfwstate =:= no_over18 | allow_over18 | only_over18
    bylang = {}

    for sr in fetch_things2(Subreddit._query(sort=desc('_date'),
                                             data=True)):
        aid = getattr(sr, 'author_id', None)
        if aid is not None and aid < 0:
            # skip special system reddits like promos
            continue

        type = getattr(sr, 'type', 'private')
        if type not in ('public', 'restricted'):
            # skips reddits that can't appear in the default list
            # because of permissions
            continue

        for lang in 'all', sr.lang:
            over18s = ['allow_over18']
            if sr.over_18:
                over18s.append('only_over18')
            else:
                over18s.append('no_over18')

            for over18 in over18s:
                k = (lang, over18)
                bylang.setdefault(k, []).append(sr)

                # keep the lists small while we work
                if len(bylang[k]) > limit*2:
                    bylang[k] = _chop(bylang[k])

    for (lang, over18), srs in bylang.iteritems():
        srs = _chop(srs)
        sr_tuples = map(lambda sr: (sr._downs, sr.allow_top, sr._id), srs)

        print "For %s/%s setting %s" % (lang, over18,
                                        map(lambda sr: sr.name, srs[:50]))
        SubredditPopularityByLanguage._set_values(lang, {over18: sr_tuples})
def queue_summary_emails():
    start = datetime.datetime.now()

    # find all accounts that should get an email
    # this implementation is slow, as it iterates over all accounts that have
    # an email address. One idea to make it faster is to turn the
    # "last_email_sent_at" data attribute into an actual sql column you can
    # query
    accounts = fetch_things2(Account._query(Account.c.email != None,
                                            sort=asc('_date')))
    for account in accounts:
        if should_send_activity_summary_email(account):
            # using _add_item over add_item as that skips using a daemon
            # thread to talk to the amqp server that might not finish its
            # job before the process exits
            amqp._add_item('summary_email_q', str(account._id))
            print "Queued summary email for %r" % (account.email,)

    end = datetime.datetime.now()
    print "Time to scan accounts to queue emails: %s" % (end - start)
def add_byurl_prefix():
    """Run once before the byurl prefix is set, and once after (killing
       the second run once it reaches the point where the first run
       started)."""
    from datetime import datetime
    from r2.models import Link
    from r2.lib.filters import _force_utf8
    from pylons import g
    from r2.lib.utils import fetch_things2
    from r2.lib.db.operators import desc
    from r2.lib.utils import base_url

    now = datetime.now(g.tz)
    print 'started at %s' % (now,)

    l_q = Link._query(
        Link.c._date < now,
        data=True,
        sort=desc('_date'))

    # from link.py
    def by_url_key(url, prefix=''):
        s = _force_utf8(base_url(url.lower()))
        return '%s%s' % (prefix, s)

    done = 0
    for links in fetch_things2(l_q, 1000, chunks=True):
        done += len(links)
        print 'Doing: %r, %s..%s' % (done, links[-1]._date, links[0]._date)

        # only links with actual URLs
        links = filter(lambda link: (not getattr(link, 'is_self', False)
                                     and getattr(link, 'url', '')),
                       links)

        # old key -> new key
        translate = dict((by_url_key(link.url),
                          by_url_key(link.url, prefix='byurl_'))
                         for link in links)

        old = g.permacache.get_multi(translate.keys())
        new = dict((translate[old_key], value)
                   for (old_key, value) in old.iteritems())
        g.permacache.set_multi(new)
def backfill_deleted_accounts(resume_id=None):
    del_accts = Account._query(Account.c._deleted == True, sort=desc('_date'))
    if resume_id:
        del_accts._filter(Account.c._id < resume_id)

    for i, account in enumerate(progress(fetch_things2(del_accts))):
        # Don't kill the rabbit! Wait for the relevant queues to calm down.
        if i % 1000 == 0:
            del_len = get_queue_length('del_account_q')
            cs_len = get_queue_length('cloudsearch_changes')
            while (del_len > 1000 or cs_len > 10000):
                sys.stderr.write(("CS: %d, DEL: %d" % (cs_len, del_len)) + "\n")
                sys.stderr.flush()
                time.sleep(1)
                del_len = get_queue_length('del_account_q')
                cs_len = get_queue_length('cloudsearch_changes')
        amqp.add_item('account_deleted', account._fullname)
def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrlAndSubreddit
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q
         if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)
    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubreddit.add_link(l)
def _populate(after_id=None, estimate=54301242):
    from r2.lib.db.operators import desc
    from r2.lib.db import tdb_cassandra
    from r2.lib import utils

    # larger has a chance to decrease the number of Cassandra writes,
    # but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc('_date'))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, 'link_id'), chunk)
        update_comment_votes(chunk)
def catch_up_batch_queries():
    # Catch up on batched_time_times queries that should have been run but
    # haven't been. This should be cronned to run about once an hour. The
    # more often it runs, the more the work of rerunning the actual queries
    # is spread out, but every run has a fixed cost of looking at every
    # single subreddit.
    sr_q = Subreddit._query(sort=desc('_downs'),
                            data=True)
    dayago = utils.timeago('1 day')
    for sr in fetch_things2(sr_q):
        if hasattr(sr, 'last_valid_vote') and sr.last_valid_vote > dayago:
            # if we don't know when the last vote was, it couldn't
            # have been today
            for sort in batched_time_sorts:
                for time in batched_time_times:
                    q = make_batched_time_query(sr, sort, time)
                    if q.preflight_check():
                        # we haven't run the batched_time_times in the
                        # last day
                        add_queries([q])

    # make sure that all of the jobs have been completed or processed
    # by the time we return
    worker.join()
def get_srmembers(after_user_id):
    previous_user_id = None

    while True:
        # there isn't a good index on rel_id so we need to get a new query
        # for each batch rather than relying solely on fetch_things2
        q = get_query(after_user_id)
        users_seen = 0

        for rel in fetch_things2(q):
            user_id = rel._thing2_id

            if user_id != previous_user_id:
                if users_seen >= 20:
                    # set after_user_id to the previous id so we will pick up
                    # the query at this same point
                    after_user_id = previous_user_id
                    break
                users_seen += 1

            previous_user_id = user_id
            yield rel