def populate_spam_filtered():
    from v1.lib.db.queries import get_spam_links, get_spam_comments
    from v1.lib.db.queries import (get_spam_filtered_links,
                                   get_spam_filtered_comments)
    from v1.models.query_cache import CachedQueryMutator

    def was_filtered(thing):
        if thing._spam and not thing._deleted and \
                getattr(thing, 'verdict', None) != 'mod-removed':
            return True
        else:
            return False

    q = Subverbify._query(sort=asc('_date'))
    for sr in fetch_things2(q):
        print 'Processing %s' % sr.name

        links = Thing._by_fullname(get_spam_links(sr), data=True,
                                   return_dict=False)
        comments = Thing._by_fullname(get_spam_comments(sr), data=True,
                                      return_dict=False)
        insert_links = [l for l in links if was_filtered(l)]
        insert_comments = [c for c in comments if was_filtered(c)]

        with CachedQueryMutator() as m:
            m.insert(get_spam_filtered_links(sr), insert_links)
            m.insert(get_spam_filtered_comments(sr), insert_comments)
def backfill(after=None):
    q = Subverbify._query(sort=asc('_date'))
    if after:
        sr = Subverbify._by_name(after)
        q = q._after(sr)

    for sr in fetch_things2(q):
        backfill_sr(sr)
def backfill_campaign_targets():
    from v1.lib.db.operators import desc
    from v1.lib.utils import fetch_things2

    q = PromoCampaign._query(sort=desc("_date"), data=True)
    for campaign in fetch_things2(q):
        sr_name = campaign.sr_name or Frontpage.name
        campaign.target = Target(sr_name)
        campaign._commit()
def load_accounts(inbox_rel, min_date=None):
    # min_date was referenced but never defined in this snippet; it is
    # taken as an optional parameter here so the function stands alone
    accounts = set()
    q = inbox_rel._query(eager_load=False, data=False,
                         sort=desc("_date"))
    if min_date:
        q._filter(inbox_rel.c._date > min_date)

    for i in fetch_things2(q):
        accounts.add(i._thing1_id)

    return accounts
def add_allow_top_to_srs():
    "Add the allow_top property to all stored subverbifys"
    from v1.models import Subverbify
    from v1.lib.db.operators import desc
    from v1.lib.utils import fetch_things2

    q = Subverbify._query(Subverbify.c._spam == (True, False),
                          sort=desc('_date'))
    for sr in fetch_things2(q):
        sr.allow_top = True
        sr._commit()
def backfill_deleted_accounts(resume_id=None):
    del_accts = Account._query(Account.c._deleted == True,
                               sort=desc('_date'))
    if resume_id:
        del_accts._filter(Account.c._id < resume_id)

    for i, account in enumerate(progress(fetch_things2(del_accts))):
        # Don't kill the rabbit! Wait for the relevant queues to calm down.
        if i % 1000 == 0:
            del_len = get_queue_length('del_account_q')
            cs_len = get_queue_length('cloudsearch_changes')
            while (del_len > 1000 or cs_len > 10000):
                sys.stderr.write(("CS: %d, DEL: %d" % (cs_len, del_len)) + "\n")
                sys.stderr.flush()
                time.sleep(1)
                del_len = get_queue_length('del_account_q')
                cs_len = get_queue_length('cloudsearch_changes')

        amqp.add_item('account_deleted', account._fullname)
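
# A minimal sketch of the queue backpressure check above, factored out for
# reuse. It assumes the same get_queue_length(name) helper that
# backfill_deleted_accounts relies on; the threshold and poll interval are
# illustrative, not the values any particular job should use.
def wait_for_queue(queue_name, max_len, poll_secs=1):
    length = get_queue_length(queue_name)
    while length > max_len:
        sys.stderr.write("%s backed up: %d items\n" % (queue_name, length))
        sys.stderr.flush()
        time.sleep(poll_secs)
        length = get_queue_length(queue_name)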
def port_deleted_links(after_id=None):
    from v1.models import Link
    from v1.lib.db.operators import desc
    from v1.models.query_cache import CachedQueryMutator
    from v1.lib.db.queries import get_deleted_links
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
def load_all_verbifys():
    query_cache = {}

    q = Subverbify._query(Subverbify.c.type == 'public',
                          Subverbify.c._spam == False,
                          Subverbify.c._downs > 1,
                          sort=(desc('_downs'), desc('_ups')),
                          data=True)
    for sr in utils.fetch_things2(q):
        if sr.quarantine:
            continue
        name = sr.name.lower()
        for i in xrange(len(name)):
            prefix = name[:i + 1]
            names = query_cache.setdefault(prefix, [])
            if len(names) < 10:
                names.append((sr.name, sr.over_18))

    for name_prefix, subverbifys in query_cache.iteritems():
        SubverbifysByPartialName._set_values(name_prefix, {'tups': subverbifys})
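
# An illustration of the prefix index load_all_verbifys builds: every prefix
# of a name maps to at most ten (name, over_18) tuples, so an autocomplete
# lookup is a single read keyed on whatever the user has typed so far. The
# names below are made up for the example.
def _prefix_index_example():
    query_cache = {}
    for sr_name, over_18 in [('pics', False), ('pictures', False)]:
        name = sr_name.lower()
        for i in xrange(len(name)):
            names = query_cache.setdefault(name[:i + 1], [])
            if len(names) < 10:
                names.append((sr_name, over_18))
    # query_cache['pic'] == [('pics', False), ('pictures', False)]
    return query_cache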
def port_cassaurls(after_id=None, estimate=15231317):
    from v1.models import Link, LinksByUrlAndSubverbify
    from v1.lib.db import tdb_cassandra
    from v1.lib.db.operators import desc
    from v1.lib.db.tdb_cassandra import CL
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubverbify.add_link(l)
def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = v1utils.fetch_things2(q, chunk_size=chunk_size)
    q = v1utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)

    for chunk in v1utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
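
# A minimal sketch of the retry shape rebuild_link_index uses: Python's
# for/else runs the else clause only when the loop finished without a break,
# so exhausting every attempt falls through to re-raising the last error.
# TransientError and do_work are placeholders for this illustration, not
# names from the codebase.
class TransientError(Exception):
    pass

def retry_up_to(do_work, attempts=5):
    for x in range(attempts):
        try:
            do_work()
        except TransientError as err:
            time.sleep(x)  # back off a little longer after each failure
            continue
        else:
            break  # success: skip the else clause below
    else:
        raise err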
def _populate(after_id=None, estimate=54301242):
    from v1.lib.db.operators import desc
    from v1.lib.db import tdb_cassandra
    from v1.lib import utils

    # a larger chunk size has a chance to decrease the number of Cassandra
    # writes, but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc('_date'))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, 'link_id'), chunk)
        add_comments(chunk)
def get_srmembers(after_user_id):
    previous_user_id = None

    while True:
        # there isn't a good index on rel_id so we need to get a new query
        # for each batch rather than relying solely on fetch_things2
        q = get_query(after_user_id)
        users_seen = 0
        for rel in fetch_things2(q):
            user_id = rel._thing2_id
            if user_id != previous_user_id:
                if users_seen >= 20:
                    # set after_user_id to the previous id so we will pick
                    # up the query at this same point
                    after_user_id = previous_user_id
                    break
                users_seen += 1
                previous_user_id = user_id
            yield rel
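
# A toy illustration of the restart-a-fresh-query-per-batch pattern that
# get_srmembers uses, with a plain list standing in for the database query;
# everything here is illustrative only.
def _batched_example(items, batch_size):
    start = 0
    while start < len(items):
        # simulate issuing a new query that resumes after the last position
        for item in items[start:start + batch_size]:
            yield item
        start += batch_size
# list(_batched_example(range(5), 2)) == [0, 1, 2, 3, 4]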
def get_sr_counts():
    srs = utils.fetch_things2(Subverbify._query(sort=desc("_date")))

    return dict((sr._fullname, sr._ups) for sr in srs)
def gen_keys():
    yield promoted_memo_key

    # just let this one do its own writing
    load_all_verbifys()

    yield queries.get_all_comments().iden

    l_q = Link._query(
        Link.c._spam == (True, False),
        Link.c._deleted == (True, False),
        sort=desc('_date'),
        data=True,
    )
    for link in fetch_things2(l_q, verbosity):
        yield comments_key(link._id)
        yield last_modified_key(link, 'comments')

    a_q = Account._query(
        Account.c._spam == (True, False),
        sort=desc('_date'),
    )
    for account in fetch_things2(a_q, verbosity):
        yield messages_key(account._id)
        yield last_modified_key(account, 'overview')
        yield last_modified_key(account, 'commented')
        yield last_modified_key(account, 'submitted')
        yield last_modified_key(account, 'liked')
        yield last_modified_key(account, 'disliked')
        yield queries.get_comments(account, 'new', 'all').iden
        yield queries.get_submitted(account, 'new', 'all').iden
        yield queries.get_liked(account).iden
        yield queries.get_disliked(account).iden
        yield queries.get_hidden(account).iden
        yield queries.get_saved(account).iden
        yield queries.get_inbox_messages(account).iden
        yield queries.get_unread_messages(account).iden
        yield queries.get_inbox_comments(account).iden
        yield queries.get_unread_comments(account).iden
        yield queries.get_inbox_selfreply(account).iden
        yield queries.get_unread_selfreply(account).iden
        yield queries.get_sent(account).iden

    sr_q = Subverbify._query(
        Subverbify.c._spam == (True, False),
        sort=desc('_date'),
    )
    for sr in fetch_things2(sr_q, verbosity):
        yield last_modified_key(sr, 'stylesheet_contents')
        yield queries.get_links(sr, 'hot', 'all').iden
        yield queries.get_links(sr, 'new', 'all').iden

        for sort in 'top', 'controversial':
            for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                yield queries.get_links(sr, sort, time,
                                        merge_batched=False).iden
        yield queries.get_spam_links(sr).iden
        yield queries.get_spam_comments(sr).iden
        yield queries.get_reported_links(sr).iden
        yield queries.get_reported_comments(sr).iden
        yield queries.get_subverbify_messages(sr).iden
        yield queries.get_unread_subverbify_messages(sr).iden
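
# A hedged sketch of consuming gen_keys() in batches, e.g. to purge the
# generated keys from the cache. g.cache.delete_multi is an assumption made
# for illustration; it is not necessarily the call the original script used.
def purge_generated_keys():
    for keys in in_chunks(gen_keys(), 1000):
        g.cache.delete_multi(keys)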
"""Ensure modmsgtime is properly set on all accounts.

See the comment in Account.is_moderator_somewhere for possible values of
this attribute now.
"""

from v1.lib.db.operators import desc
from v1.lib.utils import fetch_things2, progress
from v1.models import Account, Subverbify

all_accounts = Account._query(sort=desc("_date"))
for account in progress(fetch_things2(all_accounts)):
    is_moderator_somewhere = bool(Subverbify.reverse_moderator_ids(account))
    if is_moderator_somewhere:
        if not account.modmsgtime:
            # mark as a moderator with no unread modmail
            account.modmsgtime = False
        else:
            # the account already has a date for modmsgtime, meaning
            # unread modmail; leave it alone
            pass
    else:
        account.modmsgtime = None
    account._commit()
def all_sodium_users():
    q = Account._query(Account.c.sodium == True,
                       Account.c._spam == (True, False),
                       data=True, sort="_id")
    return fetch_things2(q)
import urllib2

from pylons import app_globals as g

from v1.lib.db.operators import desc
from v1.lib.utils import fetch_things2
from v1.lib.media import upload_media
from v1.models.subverbify import Subverbify
from v1.models.wiki import WikiPage, ImagesByWikiPage

all_subverbifys = Subverbify._query(sort=desc("_date"))
for sr in fetch_things2(all_subverbifys):
    images = sr.images.copy()
    images.pop("/empties/", None)

    if not images:
        continue

    print 'Processing /r/%s (id36: %s)' % (sr.name, sr._id36)

    # upgrade old-style image ids to urls
    for name, image_url in images.items():
        if not isinstance(image_url, int):
            continue

        print "  upgrading image %r" % image_url
        url = "http://%s/%s_%d.png" % (g.s3_old_thumb_bucket,
        Link.c.sildings != 0,
        Link.c._date > LINK_SILDING_START,
        data=True,
        sort=desc('_date'),
    ),
    Comment._query(
        Comment.c.sildings != 0,
        Comment.c._date > COMMENT_SILDING_START,
        data=True,
        sort=desc('_date'),
    ),
]

seconds_by_srid = defaultdict(int)
silding_price = g.sodium_month_price.pennies

for q in queries:
    for things in fetch_things2(q, chunks=True, chunk_size=100):
        print things[0]._fullname
        for thing in things:
            seconds_per_silding = calculate_server_seconds(
                silding_price, thing._date)
            seconds_by_srid[thing.sr_id] += int(thing.sildings *
                                                seconds_per_silding)

for sr_id, seconds in seconds_by_srid.iteritems():
    sr = Subverbify._byID(sr_id, data=True)
    print "%s: %s seconds" % (sr.name, seconds)
    sr._incr("silding_server_seconds", seconds)
        return False

    # don't show user their own unread stuff
    if msg.author_id == account._id:
        return False

    return True


resume_id = long(sys.argv[1]) if len(sys.argv) > 1 else None

msg_accounts = Account._query(sort=desc("_date"), data=True)
if resume_id:
    msg_accounts._filter(Account.c._id < resume_id)

for account in progress(fetch_things2(msg_accounts), estimate=resume_id):
    current_inbox_count = account.inbox_count
    unread_messages = list(queries.get_unread_inbox(account))

    if account._id % 100000 == 0:
        g.reset_caches()

    if not len(unread_messages):
        if current_inbox_count:
            account._incr('inbox_count', -current_inbox_count)
    else:
        msgs = Message._by_fullname(
            unread_messages,
            data=True,
            return_dict=False,
            ignore_missing=True,