def get_links(sr, sort, time, merge_batched=True):
    """General link query for a subreddit."""
    main_q = Link._query(Link.c.sr_id == sr._id,
                         sort=db_sort(sort),
                         data=True)
    if time != 'all':
        main_q._filter(db_times[time])
    res = make_results(main_q)

    # see the discussion above batched_time_times
    batchable = (merge_batched and
                 g.use_query_cache and
                 sort in batched_time_sorts and
                 time in batched_time_times)
    if batchable:
        day_q = Link._query(Link.c.sr_id == sr._id,
                            sort=db_sort(sort),
                            data=True)
        day_q._filter(db_times['day'])
        res = merge_results(res, make_results(day_q))

    return res
def get_links(self, sort, time):
    """Links submitted by the current user's important friends."""
    from r2.lib.db import queries
    from r2.models import Link
    from r2.controllers.errors import UserRequiredException

    if not c.user_is_loggedin:
        raise UserRequiredException

    friends = self.get_important_friends(c.user._id)
    if not friends:
        return []

    if g.use_query_cache:
        # with the precomputer enabled, this Subreddit only supports
        # being sorted by 'new'. it would be nice to have a
        # cleaner UI than just blatantly ignoring their sort, though
        sort = 'new'
        time = 'all'
        friend_accounts = Account._byID(friends, return_dict=False)
        cached = [queries.get_submitted(friend, sort, time)
                  for friend in friend_accounts]
        return queries.MergedCachedResults(cached)

    q = Link._query(Link.c.author_id == friends,
                    sort=queries.db_sort(sort),
                    data=True)
    if time != 'all':
        q._filter(queries.db_times[time])
    return q
def get_spam_filtered_links(sr_id):
    """NOTE: This query will never run unless someone does an "update" on it,
    but that will probably timeout. Use insert_spam_filtered_links."""
    spam_q = Link._query(Link.c.sr_id == sr_id,
                         Link.c._spam == True,
                         Link.c.verdict != 'mod-removed',
                         sort=db_sort('new'))
    return spam_q
def make_daily_promotions():
    """Daily cron entry point for the promotion system."""
    # charge campaigns so they can go live
    charge_pending(offset=0)
    charge_pending(offset=1)

    # promote links and record ids of promoted links
    live_ids = set()
    for campaign, link in get_scheduled_promos(offset=0):
        live_ids.add(link._id)
        promote_link(link, campaign)

    # expire finished links
    expired_q = Link._query(Link.c.promote_status == PROMOTE_STATUS.promoted,
                            data=True)
    expired_q = expired_q._filter(not_(Link.c._id.in_(live_ids)))
    for link in expired_q:
        update_promote_status(link, PROMOTE_STATUS.finished)
        emailer.finished_promo(link)

    # update subreddits with promos
    all_live_promo_srnames(_update=True)

    _mark_promos_updated()
    finalize_completed_campaigns(daysago=1)
    hooks.get_hook('promote.make_daily_promotions').call(offset=0)
def test_cassasavehide(): from r2.models import Account, Link, CassandraSave, SavesByAccount from r2.lib.db import tdb_cassandra a = list(Account._query(sort=desc('_date'), limit=1))[0] l = list(Link._query(sort=desc('_date'), limit=1))[0] try: csh = CassandraSave._fast_query(a._id36, l._id36) print "Warning! Deleting!", csh CassandraSave._fast_query(a._id36, l._id36)._destroy() except tdb_cassandra.NotFound: pass csh = CassandraSave._save(a, l) csh._commit() assert CassandraSave._fast_query(a._id36, l._id36) == csh # check for the SavesByAccount object too assert SavesByAccount._byID(a._id36)[csh._id] == csh._id csh._destroy() try: CassandraSave._fast_query(a._id36, l._id36) == csh raise Exception("shouldn't exist after destroying") except tdb_cassandra.NotFound: pass try: assert csh._id not in SavesByAccount._byID( a._id36, properties=csh._id)._values() except tdb_cassandra.NotFound: pass
def run(verbose=True, sleep_time = 60, num_items = 1): key = "indextank_cursor" cursor = g.cache.get(key) if cursor is None: raise ValueError("%s is not set!" % key) cursor = int(cursor) while True: if verbose: print "Looking for %d items with _id < %d" % (num_items, cursor) q = Link._query(sort = desc('_id'), limit = num_items) q._after(Link._byID(cursor)) last_date = None for item in q: cursor = item._id last_date = item._date amqp.add_item('indextank_changes', item._fullname, message_id = item._fullname, delivery_mode = amqp.DELIVERY_TRANSIENT) g.cache.set(key, cursor) if verbose: if last_date: last_date = last_date.strftime("%Y-%m-%d") print ("Just enqueued %d items. New cursor=%s (%s). Sleeping %d seconds." % (num_items, cursor, last_date, sleep_time)) sleep(sleep_time)
def test_cassasavehide(): from r2.models import Account, Link, CassandraSave, SavesByAccount from r2.lib.db import tdb_cassandra a = list(Account._query(sort=desc('_date'), limit=1))[0] l = list(Link._query(sort=desc('_date'), limit=1))[0] try: csh = CassandraSave._fast_query(a._id36, l._id36) print "Warning! Deleting!", csh CassandraSave._fast_query(a._id36, l._id36)._destroy() except tdb_cassandra.NotFound: pass csh = CassandraSave._save(a, l) csh._commit() assert CassandraSave._fast_query(a._id36, l._id36) == csh # check for the SavesByAccount object too assert SavesByAccount._byID(a._id36)[csh._id] == csh._id csh._destroy() try: CassandraSave._fast_query(a._id36, l._id36) == csh raise Exception("shouldn't exist after destroying") except tdb_cassandra.NotFound: pass try: assert csh._id not in SavesByAccount._byID(a._id36, properties = csh._id)._values() except tdb_cassandra.NotFound: pass
def run(verbose=True, sleep_time=60, num_items=1): key = "indextank_cursor" cursor = g.cache.get(key) if cursor is None: raise ValueError("%s is not set!" % key) cursor = int(cursor) while True: if verbose: print "Looking for %d items with _id < %d" % (num_items, cursor) q = Link._query(sort=desc('_id'), limit=num_items) q._after(Link._byID(cursor)) last_date = None for item in q: cursor = item._id last_date = item._date amqp.add_item('indextank_changes', item._fullname, message_id=item._fullname, delivery_mode=amqp.DELIVERY_TRANSIENT) g.cache.set(key, cursor) if verbose: if last_date: last_date = last_date.strftime("%Y-%m-%d") print( "Just enqueued %d items. New cursor=%s (%s). Sleeping %d seconds." % (num_items, cursor, last_date, sleep_time)) sleep(sleep_time)
def get_hot(sr):
    """Return the hot links for a subreddit, managing the query cache's
    expire/access bookkeeping along the way."""
    hot_q = Link._query(Link.c.sr_id == sr._id,
                        sort=desc('_hot'),
                        write_cache=True,
                        limit=150)
    iden = hot_q._iden()

    read_cache = True
    # if query is in the cache, the expire flag is true, and the access
    # time is old, set read_cache = False
    if cache.get(iden) is not None:
        if cache.get(expire_key(sr)):
            access_time = cache.get(access_key(sr))
            if not access_time or datetime.now() > access_time + expire_delta:
                cache.delete(expire_key(sr))
                read_cache = False
    else:
        # if the query isn't in the cache, set read_cache to false so we
        # record the access time
        read_cache = False

    if not read_cache:
        cache.set(access_key(sr), datetime.now())

    hot_q._read_cache = read_cache
    res = list(hot_q)

    # set the #1 link so we can ignore it later. expire after TOP_CACHE
    # just in case something happens and that sr doesn't update
    if res:
        cache.set(top_key(sr), res[0]._fullname, TOP_CACHE)

    return res
def get_links(self, sort, time):
    """Site-wide link query with the given sort and time filter."""
    from r2.models import Link
    from r2.lib.db import queries

    link_q = Link._query(sort=queries.db_sort(sort))
    if time != 'all':
        link_q._filter(queries.db_times[time])
    return link_q
def get_spam_filtered_links(sr_id):
    """NOTE: This query will never run unless someone does an "update" on it,
    but that will probably timeout. Use insert_spam_filtered_links."""
    spam_q = Link._query(Link.c.sr_id == sr_id,
                         Link.c._spam == True,
                         Link.c.verdict != 'mod-removed',
                         sort=db_sort('new'))
    return spam_q
def get_links(self, sort, time):
    """Links submitted by the current user's important friends."""
    from r2.lib.db import queries
    from r2.models import Link
    from r2.controllers.errors import UserRequiredException

    if not c.user_is_loggedin:
        raise UserRequiredException

    friends = self.get_important_friends(c.user._id)
    if not friends:
        return []

    if g.use_query_cache:
        # with the precomputer enabled, this Subreddit only supports
        # being sorted by 'new'. it would be nice to have a
        # cleaner UI than just blatantly ignoring their sort, though
        sort = "new"
        time = "all"
        friend_accounts = Account._byID(friends, return_dict=False)
        cached = [queries.get_submitted(friend, sort, time)
                  for friend in friend_accounts]
        return queries.MergedCachedResults(cached)

    q = Link._query(Link.c.author_id == friends,
                    sort=queries.db_sort(sort),
                    data=True)
    if time != "all":
        q._filter(queries.db_times[time])
    return q
def import_missing_comments(filename, apply_changes=False): """Imports the comments from the supplied YAML""" missing_comments = yaml.load(open(filename), Loader=yaml.CLoader) global dryrun dryrun = not apply_changes total_posts = len(missing_comments) post_count = 0 for post in missing_comments: if post['author'] != 'Eliezer Yudkowsky': # print "Skipping non-EY post (%s): %s" % (post['author'], post['permalink']) continue ob_permalink = adjust_permalink(post['permalink']) # Attempt to retrieve the post that was imported into Less Wrong imported_post = list(Link._query(Link.c.ob_permalink == ob_permalink, data=True)) if len(imported_post) < 1: print "Unable to retrieve imported post: %s" % ob_permalink continue elif len(imported_post) > 1: print "Got more than one result for: %s" % ob_permalink raise Exception else: imported_post = imported_post[0] post_count += 1 try: print "Importing (%d of %d) comments on: %s" % (post_count, total_posts, imported_post.canonical_url) except UnicodeError: print "Importing comments on post (%d of %d)" process_comments_on_post(imported_post, post['comments'])
def gen_keys():
    """Yield every permacache key that needs to be migrated."""
    yield promoted_memo_key

    # just let this one do its own writing
    load_all_reddits()

    yield queries.get_all_comments().iden

    l_q = Link._query(Link.c._spam == (True, False),
                      Link.c._deleted == (True, False),
                      sort=desc('_date'),
                      data=True)
    for link in fetch_things2(l_q, verbosity):
        yield comments_key(link._id)
        yield last_modified_key(link, 'comments')

    a_q = Account._query(Account.c._spam == (True, False),
                         sort=desc('_date'))
    for account in fetch_things2(a_q, verbosity):
        yield messages_key(account._id)
        yield last_modified_key(account, 'overview')
        yield last_modified_key(account, 'commented')
        yield last_modified_key(account, 'submitted')
        yield last_modified_key(account, 'liked')
        yield last_modified_key(account, 'disliked')
        yield queries.get_comments(account, 'new', 'all').iden
        yield queries.get_submitted(account, 'new', 'all').iden
        yield queries.get_liked(account).iden
        yield queries.get_disliked(account).iden
        yield queries.get_hidden(account).iden
        yield queries.get_saved(account).iden
        yield queries.get_inbox_messages(account).iden
        yield queries.get_unread_messages(account).iden
        yield queries.get_inbox_comments(account).iden
        yield queries.get_unread_comments(account).iden
        yield queries.get_inbox_selfreply(account).iden
        yield queries.get_unread_selfreply(account).iden
        yield queries.get_sent(account).iden

    sr_q = Subreddit._query(Subreddit.c._spam == (True, False),
                            sort=desc('_date'))
    for sr in fetch_things2(sr_q, verbosity):
        yield last_modified_key(sr, 'stylesheet_contents')
        yield queries.get_links(sr, 'hot', 'all').iden
        yield queries.get_links(sr, 'new', 'all').iden
        for sort in 'top', 'controversial':
            for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                yield queries.get_links(sr, sort, time,
                                        merge_batched=False).iden
        yield queries.get_spam_links(sr).iden
        yield queries.get_spam_comments(sr).iden
        yield queries.get_reported_links(sr).iden
        yield queries.get_reported_comments(sr).iden
        yield queries.get_subreddit_messages(sr).iden
        yield queries.get_unread_subreddit_messages(sr).iden
def gen_keys():
    """Yield every permacache key that needs to be migrated."""
    yield promoted_memo_key

    # just let this one do its own writing
    load_all_reddits()

    yield queries.get_all_comments().iden

    l_q = Link._query(Link.c._spam == (True, False),
                      Link.c._deleted == (True, False),
                      sort=desc('_date'),
                      data=True)
    for link in fetch_things2(l_q, verbosity):
        yield comments_key(link._id)
        yield last_modified_key(link, 'comments')

    a_q = Account._query(Account.c._spam == (True, False),
                         sort=desc('_date'))
    for account in fetch_things2(a_q, verbosity):
        yield messages_key(account._id)
        yield last_modified_key(account, 'overview')
        yield last_modified_key(account, 'commented')
        yield last_modified_key(account, 'submitted')
        yield last_modified_key(account, 'liked')
        yield last_modified_key(account, 'disliked')
        yield queries.get_comments(account, 'new', 'all').iden
        yield queries.get_submitted(account, 'new', 'all').iden
        yield queries.get_liked(account).iden
        yield queries.get_disliked(account).iden
        yield queries.get_hidden(account).iden
        yield queries.get_saved(account).iden
        yield queries.get_inbox_messages(account).iden
        yield queries.get_unread_messages(account).iden
        yield queries.get_inbox_comments(account).iden
        yield queries.get_unread_comments(account).iden
        yield queries.get_inbox_selfreply(account).iden
        yield queries.get_unread_selfreply(account).iden
        yield queries.get_sent(account).iden

    sr_q = Subreddit._query(Subreddit.c._spam == (True, False),
                            sort=desc('_date'))
    for sr in fetch_things2(sr_q, verbosity):
        yield last_modified_key(sr, 'stylesheet_contents')
        yield queries.get_links(sr, 'hot', 'all').iden
        yield queries.get_links(sr, 'new', 'all').iden
        for sort in 'top', 'controversial':
            for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                yield queries.get_links(sr, sort, time,
                                        merge_batched=False).iden
        yield queries.get_spam_links(sr).iden
        yield queries.get_spam_comments(sr).iden
        yield queries.get_reported_links(sr).iden
        yield queries.get_reported_comments(sr).iden
        yield queries.get_subreddit_messages(sr).iden
        yield queries.get_unread_subreddit_messages(sr).iden
def _query_post(self, *args):
    """Return the first Link matching the given query constraints, or None."""
    matches = list(Link._query(*args, **{'data': True}))
    if not matches:
        return None
    return matches[0]
def _query_post(self, *args):
    """Return the first Link matching the given query constraints, or None."""
    matches = list(Link._query(*args, **{'data': True}))
    if not matches:
        return None
    return matches[0]
def get_domain_links(domain, sort, time):
    """Link query for everything submitted from a given domain."""
    from r2.lib.db import operators

    domain_q = Link._query(
        operators.domain(Link.c.url) == filters._force_utf8(domain),
        sort=db_sort(sort),
        data=True)
    if time != "all":
        domain_q._filter(db_times[time])
    return make_results(domain_q)
def shorten_byurl_keys():
    """We changed by_url keys from a format like byurl_google.com... to:
    byurl(1d5920f4b44b27a802bd77c4f0536f5a, google.com...) so that they
    would fit in memcache's 251-char limit
    """
    from datetime import datetime
    from hashlib import md5
    from r2.models import Link
    from r2.lib.filters import _force_utf8
    from pylons import g
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.db.operators import desc
    from r2.lib.utils import base_url, progress

    # from link.py
    def old_by_url_key(url):
        prefix = 'byurl_'
        s = _force_utf8(base_url(url.lower()))
        return '%s%s' % (prefix, s)

    def new_by_url_key(url):
        maxlen = 250
        template = 'byurl(%s,%s)'
        keyurl = _force_utf8(base_url(url.lower()))
        hexdigest = md5(keyurl).hexdigest()
        usable_len = maxlen - len(template) - len(hexdigest)
        return template % (hexdigest, keyurl[:usable_len])

    verbosity = 1000

    l_q = Link._query(Link.c._spam == (True, False),
                      data=True,
                      sort=desc('_date'))
    link_stream = progress(fetch_things2(l_q, verbosity),
                           key=lambda link: link._date,
                           verbosity=verbosity,
                           estimate=int(9.9e6),
                           persec=True)
    for links in in_chunks(link_stream, verbosity):
        # only links with actual URLs
        links = filter(lambda link: (not getattr(link, 'is_self', False)
                                     and getattr(link, 'url', '')),
                       links)

        # old key -> new key
        translate = dict((old_by_url_key(link.url),
                          new_by_url_key(link.url))
                         for link in links)

        old = g.permacache.get_multi(translate.keys())
        new = dict((translate[old_key], value)
                   for (old_key, value) in old.iteritems())
        g.permacache.set_multi(new)
def get_domain_links(domain, sort, time):
    """Link query for everything submitted from a given domain."""
    from r2.lib.db import operators

    domain_q = Link._query(
        operators.domain(Link.c.url) == filters._force_utf8(domain),
        sort=db_sort(sort),
        data=True)
    if time != "all":
        domain_q._filter(db_times[time])
    return make_results(domain_q)
def get_links(self, sort, time):
    """Cached site-wide link query with data loaded."""
    from r2.lib import promote
    from r2.models import Link
    from r2.lib.db import queries

    link_q = Link._query(sort=queries.db_sort(sort),
                         read_cache=True,
                         write_cache=True,
                         cache_time=60,
                         data=True)
    if time != "all":
        link_q._filter(queries.db_times[time])
    return link_q
def get_unmoderated_links(sr_id):
    """Links in the subreddit awaiting a moderator verdict."""
    unmod_q = Link._query(Link.c.sr_id == sr_id,
                          Link.c._spam == (True, False),
                          sort=db_sort('new'))
    # Doesn't really work because will not return Links with no verdict
    unmod_q._filter(or_(and_(Link.c._spam == True,
                             Link.c.verdict != 'mod-removed'),
                        and_(Link.c._spam == False,
                             Link.c.verdict != 'mod-approved')))
    return unmod_q
def get_unmoderated_links(sr_id):
    """Links in the subreddit awaiting a moderator verdict."""
    unmod_q = Link._query(Link.c.sr_id == sr_id,
                          Link.c._spam == (True, False),
                          sort=db_sort('new'))
    # Doesn't really work because will not return Links with no verdict
    unmod_q._filter(or_(and_(Link.c._spam == True,
                             Link.c.verdict != 'mod-removed'),
                        and_(Link.c._spam == False,
                             Link.c.verdict != 'mod-approved')))
    return unmod_q
def _get_links(sr_id, sort, time):
    """General link query for a subreddit."""
    link_q = Link._query(Link.c.sr_id == sr_id,
                         sort=db_sort(sort),
                         data=True)
    if time != 'all':
        link_q._filter(db_times[time])
    return make_results(link_q)
def _get_links(sr_id, sort, time):
    """General link query for a subreddit."""
    link_q = Link._query(Link.c.sr_id == sr_id,
                         sort=db_sort(sort),
                         data=True)
    if time != "all":
        link_q._filter(db_times[time])
    return make_results(link_q)
def get_links(sr, sort, time, merge_batched=True):
    """General link query for a subreddit."""
    base_q = Link._query(Link.c.sr_id == sr._id, sort=db_sort(sort))
    if time != 'all':
        base_q._filter(db_times[time])
    res = make_results(base_q)

    # see the discussion above batched_time_times
    if (merge_batched
            and g.use_query_cache
            and sort in batched_time_sorts
            and time in batched_time_times):
        day_q = Link._query(Link.c.sr_id == sr._id, sort=db_sort(sort))
        day_q._filter(db_times['day'])
        res = merge_results(res, make_results(day_q))

    return res
def get_links(sr, sort, time):
    """General link query for a subreddit."""
    link_q = Link._query(Link.c.sr_id == sr._id, sort=db_sort(sort))
    if sort == "toplinks":
        link_q._filter(Link.c.top_link == True)
    if time != "all":
        link_q._filter(db_times[time])
    return make_results(link_q)
def get_links(self, sort, time):
    """Cached site-wide link query."""
    from r2.lib import promote
    from r2.models import Link
    from r2.lib.db import queries

    link_q = Link._query(sort=queries.db_sort(sort),
                         read_cache=True,
                         write_cache=True,
                         cache_time=60)
    if time != 'all':
        link_q._filter(queries.db_times[time])
    return link_q
def get_links(sr, sort, time, merge_batched=True):
    """General link query for a subreddit."""
    link_q = Link._query(Link.c.sr_id == sr._id,
                         sort=db_sort(sort),
                         data=True)
    if time != 'all':
        link_q._filter(db_times[time])
    return make_results(link_q)
def get_links(self, sort, time):
    """Links submitted by the current user's friends."""
    from r2.lib.db import queries
    from r2.models import Link
    from r2.controllers.errors import UserRequiredException

    if not c.user_is_loggedin:
        raise UserRequiredException

    friend_q = Link._query(Link.c.author_id == c.user.friends,
                           sort=queries.db_sort(sort))
    if time != "all":
        friend_q._filter(queries.db_times[time])
    return friend_q
def gen_keys():
    """Yield every permacache key that needs to be migrated."""
    yield promoted_memo_key

    # just let this one do its own writing
    load_all_reddits()

    yield queries.get_all_comments().iden

    l_q = Link._query(Link.c._spam == (True, False),
                      Link.c._deleted == (True, False),
                      sort=desc("_date"),
                      data=True)
    for link in fetch_things2(l_q, verbosity):
        yield comments_key(link._id)
        yield last_modified_key(link, "comments")

    a_q = Account._query(Account.c._spam == (True, False),
                         sort=desc("_date"))
    for account in fetch_things2(a_q, verbosity):
        yield messages_key(account._id)
        yield last_modified_key(account, "overview")
        yield last_modified_key(account, "commented")
        yield last_modified_key(account, "submitted")
        yield last_modified_key(account, "liked")
        yield last_modified_key(account, "disliked")
        yield queries.get_comments(account, "new", "all").iden
        yield queries.get_submitted(account, "new", "all").iden
        yield queries.get_liked(account).iden
        yield queries.get_disliked(account).iden
        yield queries.get_hidden(account).iden
        yield queries.get_saved(account).iden
        yield queries.get_inbox_messages(account).iden
        yield queries.get_unread_messages(account).iden
        yield queries.get_inbox_comments(account).iden
        yield queries.get_unread_comments(account).iden
        yield queries.get_inbox_selfreply(account).iden
        yield queries.get_unread_selfreply(account).iden
        yield queries.get_sent(account).iden

    sr_q = Subreddit._query(Subreddit.c._spam == (True, False),
                            sort=desc("_date"))
    for sr in fetch_things2(sr_q, verbosity):
        yield last_modified_key(sr, "stylesheet_contents")
        yield queries.get_links(sr, "hot", "all").iden
        yield queries.get_links(sr, "new", "all").iden
        for sort in "top", "controversial":
            for time in "hour", "day", "week", "month", "year", "all":
                yield queries.get_links(sr, sort, time,
                                        merge_batched=False).iden
        yield queries.get_spam_links(sr).iden
        yield queries.get_spam_comments(sr).iden
        yield queries.get_reported_links(sr).iden
        yield queries.get_reported_comments(sr).iden
        yield queries.get_subreddit_messages(sr).iden
        yield queries.get_unread_subreddit_messages(sr).iden
def get_links(self, sort, time):
    """Cached link query over all real subreddits (sr_id > 0)."""
    from r2.lib import promote
    from r2.models import Link
    from r2.lib.db import queries

    link_q = Link._query(Link.c.sr_id > 0,
                         sort=queries.db_sort(sort),
                         read_cache=True,
                         write_cache=True,
                         cache_time=60,
                         data=True,
                         filter_primary_sort_only=True)
    if time != 'all':
        link_q._filter(queries.db_times[time])
    return link_q
def get_links(self, sort, time):
    """Cached link query over all real subreddits (sr_id > 0)."""
    from r2.lib import promote
    from r2.models import Link
    from r2.lib.db import queries

    link_q = Link._query(Link.c.sr_id > 0,
                         sort=queries.db_sort(sort),
                         read_cache=True,
                         write_cache=True,
                         cache_time=60,
                         data=True,
                         filter_primary_sort_only=True)
    if time != 'all':
        link_q._filter(queries.db_times[time])
    return link_q
def write_all_hot_cache():
    """Recompute the r/all hot list and store its fullnames in gencache."""
    from r2.models.link import Link
    from r2.lib.db import queries

    hot_q = Link._query(sort=queries.db_sort('hot'), limit=NUM_LINKS)
    top_links = resort_links(list(hot_q))
    link_ids = [link._fullname for link in top_links]

    g.gencache.set(CACHE_KEY, link_ids)
    return link_ids
def default_queries():
    """Queries to prime for logged-out default subreddits."""
    from r2.models import Link, Subreddit
    from r2.lib.db.operators import desc
    from copy import deepcopy

    results = []
    q = Link._query(Link.c.sr_id == Subreddit.user_subreddits(None),
                    sort=desc("_hot"),
                    limit=37)
    results.append(q)

    # add a higher limit one too
    bigger = deepcopy(q)
    bigger._limit = 75
    results.append(bigger)

    return results
def port_deleted_links(after_id=None):
    """Backfill the get_deleted_links cached query from the links table."""
    from r2.models import Link
    from r2.lib.db.operators import desc
    from r2.models.query_cache import CachedQueryMutator
    from r2.lib.db.queries import get_deleted_links
    from r2.lib.utils import fetch_things2, in_chunks, progress

    links = Link._query(Link.c._deleted == True,
                        Link.c._spam == (True, False),
                        sort=desc("_date"), data=True)
    links = fetch_things2(links, chunk_size=500)
    links = progress(links, verbosity=1000)

    for chunk in in_chunks(links):
        with CachedQueryMutator() as m:
            for link in chunk:
                m.insert(get_deleted_links(link.author_id), [link])
def import_into_subreddit(self, sr, data, rewrite_map_file): posts = list(Link._query()) for post in posts: post._delete_from_db() comments = self._query_comments() for comment in comments: comment._delete_from_db() for post_data in data: try: print post_data['title'] self.process_post(post_data, sr) except Exception, e: print 'Unable to create post:\n%s\n%s\n%s' % (type(e), e, post_data) raise
def get_links_sr_ids(self, sr_ids, sort, time):
    """Merged link listing across the given subreddit ids."""
    from r2.lib.db import queries
    from r2.models import Link

    if not sr_ids:
        return []

    srs = Subreddit._byID(sr_ids, data=True, return_dict=False)
    if g.use_query_cache:
        results = [queries.get_links(sr, sort, time) for sr in srs]
        return queries.merge_results(*results)

    link_q = Link._query(Link.c.sr_id == sr_ids,
                         sort=queries.db_sort(sort),
                         data=True)
    if time != "all":
        link_q._filter(queries.db_times[time])
    return link_q
def import_into_subreddit(self, sr, data, rewrite_map_file): posts = list(Link._query()) for post in posts: post._delete_from_db() comments = self._query_comments() for comment in comments: comment._delete_from_db() for post_data in data: try: print post_data['title'] self.process_post(post_data, sr) except Exception, e: print 'Unable to create post:\n%s\n%s\n%s' % (type(e), e, post_data) raise
def default_queries():
    """Queries to prime for logged-out default subreddits."""
    from r2.models import Link, Subreddit
    from r2.lib.db.operators import desc
    from copy import deepcopy

    results = []
    q = Link._query(Link.c.sr_id == Subreddit.user_subreddits(None),
                    sort=desc('_hot'),
                    limit=37)
    results.append(q)

    # add a higher limit one too
    bigger = deepcopy(q)
    bigger._limit = 75
    results.append(bigger)

    return results
def get_links_sr_ids(self, sr_ids, sort, time):
    """Merged link listing across the given subreddit ids."""
    from r2.lib.db import queries
    from r2.models import Link

    if not sr_ids:
        return []

    srs = Subreddit._byID(sr_ids, return_dict=False)
    if g.use_query_cache:
        results = [queries.get_links(sr, sort, time) for sr in srs]
        return queries.merge_results(*results)

    link_q = Link._query(Link.c.sr_id == sr_ids,
                         sort=queries.db_sort(sort))
    if time != 'all':
        link_q._filter(queries.db_times[time])
    return link_q
def add_byurl_prefix(): """Run one before the byurl prefix is set, and once after (killing it after it gets when it started the first time""" from datetime import datetime from r2.models import Link from r2.lib.filters import _force_utf8 from pylons import g from r2.lib.utils import fetch_things2 from r2.lib.db.operators import desc from r2.lib.utils import base_url now = datetime.now(g.tz) print 'started at %s' % (now,) l_q = Link._query( Link.c._date < now, data=True, sort=desc('_date')) # from link.py def by_url_key(url, prefix=''): s = _force_utf8(base_url(url.lower())) return '%s%s' % (prefix, s) done = 0 for links in fetch_things2(l_q, 1000, chunks=True): done += len(links) print 'Doing: %r, %s..%s' % (done, links[-1]._date, links[0]._date) # only links with actual URLs links = filter(lambda link: (not getattr(link, 'is_self', False) and getattr(link, 'url', '')), links) # old key -> new key translate = dict((by_url_key(link.url), by_url_key(link.url, prefix='byurl_')) for link in links) old = g.permacache.get_multi(translate.keys()) new = dict((translate[old_key], value) for (old_key, value) in old.iteritems()) g.permacache.set_multi(new)
def port_deleted_links(after_id=None):
    """Backfill the get_deleted_links cached query from the links table."""
    from r2.models import Link
    from r2.lib.db.operators import desc
    from r2.models.query_cache import CachedQueryMutator
    from r2.lib.db.queries import get_deleted_links
    from r2.lib.utils import fetch_things2, in_chunks, progress

    links = Link._query(Link.c._deleted == True,
                        Link.c._spam == (True, False),
                        sort=desc('_date'), data=True)
    links = fetch_things2(links, chunk_size=500)
    links = progress(links, verbosity=1000)

    for chunk in in_chunks(links):
        with CachedQueryMutator() as m:
            for link in chunk:
                m.insert(get_deleted_links(link.author_id), [link])
def _promoted_link_query(user_id, status):
    """Query promoted links by status, optionally restricted to one author."""
    STATUS_CODES = {
        "unpaid": PROMOTE_STATUS.unpaid,
        "unapproved": PROMOTE_STATUS.unseen,
        "rejected": PROMOTE_STATUS.rejected,
        "live": PROMOTE_STATUS.promoted,
        "accepted": (PROMOTE_STATUS.accepted,
                     PROMOTE_STATUS.pending,
                     PROMOTE_STATUS.finished),
    }

    promo_q = Link._query(Link.c.sr_id == get_promote_srid(),
                          Link.c._spam == (True, False),
                          Link.c._deleted == (True, False),
                          Link.c.promote_status == STATUS_CODES[status],
                          sort=db_sort("new"))
    if user_id:
        promo_q._filter(Link.c.author_id == user_id)
    return promo_q
def fix_images(dryrun=True): from r2.models import Link, Comment links = Link._query(Link.c.ob_permalink != None, data=True) for link in links: ob_url = link.ob_permalink.strip() print "Processing %s" % ob_url new_content = process_content(link.article) if not dryrun: link.article = new_content link._commit() comments = Comment._query(Comment.c.link_id == link._id, data=True) for comment in comments: new_content = process_content(comment.body) if not dryrun: comment.body = new_content comment._commit()
def fix_images(dryrun=True): from r2.models import Link, Comment links = Link._query(Link.c.ob_permalink != None, data = True) for link in links: ob_url = link.ob_permalink.strip() print "Processing %s" % ob_url new_content = process_content(link.article) if not dryrun: link.article = new_content link._commit() comments = Comment._query(Comment.c.link_id == link._id, data = True) for comment in comments: new_content = process_content(comment.body) if not dryrun: comment.body = new_content comment._commit()
def get_all_query(sort, time):
    """Return a Query for r/all links sorted by anything other than Hot,
    which has special treatment."""
    from r2.models import Link
    from r2.lib.db import queries

    all_q = Link._query(sort=queries.db_sort(sort),
                        read_cache=True,
                        write_cache=True,
                        cache_time=60,
                        data=True,
                        filter_primary_sort_only=True)
    if time != 'all':
        all_q._filter(queries.db_times[time])
    return all_q
def _get_links(sr_id, sort, time, no_children=False):
    """General link query for a subsciteit."""
    # Get the children if there are any...
    from r2.lib.normalized_hot import expand_children

    # Are we building a lot of them?  Expand into children and merge
    # the per-child results.
    if not no_children:
        child_ids = expand_children(sr_id, byID=True)
        child_results = [_get_links(cid, sort, time, no_children=True)
                         for cid in child_ids]
        return merge_results(*child_results)

    link_q = Link._query(Link.c.sr_id == sr_id,
                         sort=db_sort(sort),
                         data=True)
    if time != 'all':
        link_q._filter(db_times[time])
    return make_results(link_q)
def port_cassaurls(after_id=None, estimate=15231317):
    """Backfill the Cassandra LinksByUrl index from the links table."""
    from r2.models import Link, LinksByUrl
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    link_q = Link._query(Link.c._spam == (True, False),
                         sort=desc("_date"), data=True)
    if after_id:
        link_q._after(Link._byID(after_id, data=True))
    link_q = fetch_things2(link_q, chunk_size=500)
    link_q = progress(link_q, estimate=estimate)
    # only external links get by-url entries
    link_q = (l for l in link_q
              if getattr(l, "url", "self") != "self"
              and not getattr(l, "is_self", False))

    for chunk in in_chunks(link_q, 500):
        with LinksByUrl._cf.batch(write_consistency_level=CL.ONE) as b:
            for l in chunk:
                k = LinksByUrl._key_from_url(l.url)
                if k:
                    b.insert(k, {l._id36: l._id36})
def spam_account_links(self, account, query_limit=10000, spam_limit=500):
    """Mark up to `spam_limit` of the account's newest non-spam links as
    spam, skipping anything a moderator already approved."""
    from r2.lib.db.operators import asc, desc, timeago

    q = Link._query(Link.c.author_id == account._id,
                    Link.c._spam == False,
                    sort=desc('_date'),
                    data=False)
    q._limit = query_limit

    processed = 0
    for item in list(q):
        # the old loop kept iterating as a no-op after hitting the limit;
        # stop early instead (behavior otherwise identical)
        if processed >= spam_limit:
            break
        verdict = getattr(item, "verdict", None)
        if not verdict or not verdict.endswith("-approved"):
            processed += 1
            admintools.spam(item, auto=False,
                            moderator_banned=False,
                            banner=None,
                            train_spam=True)
def port_cassaurls(after_id=None, estimate=15231317):
    """Backfill the LinksByUrlAndSubreddit index from the links table."""
    from r2.models import Link, LinksByUrlAndSubreddit
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    link_q = Link._query(Link.c._spam == (True, False),
                         sort=desc('_date'), data=True)
    if after_id:
        link_q._after(Link._byID(after_id, data=True))
    link_q = fetch_things2(link_q, chunk_size=500)
    link_q = progress(link_q, estimate=estimate)
    # only external links get by-url entries
    link_q = (l for l in link_q
              if getattr(l, 'url', 'self') != 'self'
              and not getattr(l, 'is_self', False))

    for chunk in in_chunks(link_q, 500):
        for l in chunk:
            LinksByUrlAndSubreddit.add_link(l)
def _post_process(self, rewrite_map_file): def unicode_safe(text): if isinstance(text, unicode): return text.encode('utf-8') else: return text posts = list(Link._query(Link.c.ob_permalink != None, data = True)) # Generate a mapping between ob permalinks and imported posts self.post_mapping = {} for post in posts: self.post_mapping[post.ob_permalink] = post # Write out the rewrite map for old_url, post in self.post_mapping.iteritems(): ob_url = urlparse.urlparse(old_url) new_url = post.canonical_url try: rewrite_map_file.write("%s %s\n" % (unicode_safe(ob_url.path), unicode_safe(new_url))) except UnicodeEncodeError, uee: print "Unable to write to rewrite map file:" print unicode_safe(ob_url.path) print unicode_safe(new_url)
def port_cassaurls(after_id=None, estimate=15231317):
    """Backfill the Cassandra LinksByUrl index from the links table."""
    from r2.models import Link, LinksByUrl
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    link_q = Link._query(Link.c._spam == (True, False),
                         sort=desc('_date'), data=True)
    if after_id:
        link_q._after(Link._byID(after_id, data=True))
    link_q = fetch_things2(link_q, chunk_size=500)
    link_q = progress(link_q, estimate=estimate)
    # only external links get by-url entries
    link_q = (l for l in link_q
              if getattr(l, 'url', 'self') != 'self'
              and not getattr(l, 'is_self', False))

    for chunk in in_chunks(link_q, 500):
        with LinksByUrl._cf.batch(write_consistency_level=CL.ONE) as b:
            for l in chunk:
                k = LinksByUrl._key_from_url(l.url)
                if k:
                    b.insert(k, {l._id36: l._id36})
from r2.lib.db.operators import desc from r2.lib.utils import fetch_things2 from r2.models import ( calculate_server_seconds, Comment, Link, Subreddit, ) LINK_GILDING_START = datetime(2014, 2, 1, 0, 0, tzinfo=g.tz) COMMENT_GILDING_START = datetime(2012, 10, 1, 0, 0, tzinfo=g.tz) queries = [ Link._query( Link.c.gildings != 0, Link.c._date > LINK_GILDING_START, data=True, sort=desc('_date'), ), Comment._query( Comment.c.gildings != 0, Comment.c._date > COMMENT_GILDING_START, data=True, sort=desc('_date'), ), ] seconds_by_srid = defaultdict(int) gilding_price = g.gold_month_price.pennies for q in queries: for things in fetch_things2(q, chunks=True, chunk_size=100):
def get_reported_links(sr):
    """Non-spam links in the subreddit that have at least one report."""
    reported_q = Link._query(Link.c.reported != 0,
                             Link.c.sr_id == sr._id,
                             Link.c._spam == False,
                             sort=db_sort('new'))
    return make_results(reported_q)
def get_spam_links(sr):
    """Links in the subreddit currently marked as spam."""
    spam_q = Link._query(Link.c.sr_id == sr._id,
                         Link.c._spam == True,
                         sort=db_sort('new'))
    return make_results(spam_q)