def update_flair_counts():
    flairs = Counter()
    user_ids = []

    sr = Subreddit._byID(g.live_config["thebutton_srid"], data=True)
    raw = AccountsActiveBySR._cf.xget(sr._id36)
    for uid, _ in raw:
        user_ids.append(uid)

    for user_chunk in in_chunks(user_ids, size=100):
        users = Account._byID36(user_chunk, data=True, return_dict=False)
        for user in users:
            flair = user.flair_css_class(sr._id)
            if not flair:
                if user._date < ACCOUNT_CREATION_CUTOFF:
                    flair = "no-press"
                else:
                    flair = "cant-press"
            flairs[flair] += 1

    if 'cheater' in flairs:
        del flairs['cheater']

    sr.flair_counts = sorted(
        flairs.iteritems(),
        key=lambda x: 'z' if x[0] == 'no-press' else x[0],
        reverse=True)
    sr._commit()

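# Every snippet in this collection leans on r2's in_chunks() helper from
# r2.lib.utils. A minimal sketch of the assumed behavior (yield successive
# lists of at most `size` items from any iterable); the real implementation
# may differ in detail:
def in_chunks(it, size=25):
    chunk = []
    for item in it:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk
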
def update_flair_counts():
    flairs = Counter()

    sr = Subreddit._byID(g.live_config["thebutton_srid"], data=True)
    user_ids = [ba._id36 for ba in ButtonActivity._all()]

    for user_chunk in in_chunks(user_ids, size=100):
        users = Account._byID36(user_chunk, data=True, return_dict=False)
        for user in users:
            flair = user.flair_css_class(sr._id)
            if not flair:
                if user._date < ACCOUNT_CREATION_CUTOFF:
                    flair = "no-press"
                else:
                    flair = "cant-press"
            flairs[flair] += 1

    if 'cheater' in flairs:
        del flairs['cheater']

    sr.flair_counts = sorted(
        flairs.iteritems(),
        key=lambda x: 'z' if x[0] == 'no-press' else x[0],
        reverse=True)
    sr._commit()

def _rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                        uploader=SolrLinkUploader, estimate=50000000,
                        chunk_size=1000):
    uploader = uploader()

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)

    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        uploader.fullnames = [c._fullname for c in chunk]
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)

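# The inject() retry loops in these rebuild functions use Python's for/else:
# the else clause runs only if the loop never hit break, i.e. all five
# attempts raised, at which point the last error is re-raised. A stripped-down
# sketch of the idiom (retry_upload and do_upload are illustrative names, not
# part of r2):
import time

def retry_upload(do_upload, attempts=5):
    for x in range(attempts):
        try:
            do_upload()
        except Exception as err:
            time.sleep(x)  # wait a little longer after each failure
            continue
        else:
            break          # success; the else clause below is skipped
    else:
        raise err          # every attempt failed; surface the last error
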
def rebuild_index(start_at=None, sleeptime=1, cls=Link, estimate=50000000,
                  chunk_size=1000):
    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(start_at)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             _REBUILD_INDEX_CACHE_KEY)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        for x in range(5):
            try:
                inject(chunk)
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(_REBUILD_INDEX_CACHE_KEY, last_update._fullname)
        time.sleep(sleeptime)

def update_activity():
    events = {}
    event_counts = collections.Counter()

    query = (ev for ev in LiveUpdateEvent._all()
             if ev.state == "live" and not ev.banned)
    for chunk in utils.in_chunks(query, size=100):
        context_ids = {ev._fullname: ev._id for ev in chunk}
        view_countable = [ev._fullname for ev in chunk
                          if ev._date >= g.liveupdate_min_date_viewcounts]
        view_counts_query = ViewCountsQuery.execute_async(view_countable)

        try:
            with c.activity_service.retrying(attempts=4) as svc:
                infos = svc.count_activity_multi(context_ids.keys())
        except TTransportException:
            continue

        view_counts = view_counts_query.result()

        for context_id, info in infos.iteritems():
            event_id = context_ids[context_id]

            try:
                LiveUpdateActivityHistoryByEvent.record_activity(
                    event_id, info.count)
            except tdb_cassandra.TRANSIENT_EXCEPTIONS as e:
                g.log.warning("Failed to update activity history for %r: %s",
                              event_id, e)

            try:
                event = LiveUpdateEvent.update_activity(
                    event_id, info.count, info.is_fuzzed)
            except tdb_cassandra.TRANSIENT_EXCEPTIONS as e:
                g.log.warning("Failed to update event activity for %r: %s",
                              event_id, e)
            else:
                events[event_id] = event
                event_counts[event_id] = info.count

            websockets.send_broadcast(
                "/live/" + event_id,
                type="activity",
                payload={
                    "count": info.count,
                    "fuzzed": info.is_fuzzed,
                    "total_views": view_counts.get(context_id),
                },
            )

    top_event_ids = [event_id for event_id, count
                     in event_counts.most_common(1000)]
    top_events = [events[event_id] for event_id in top_event_ids]
    query_ttl = datetime.timedelta(days=3)
    with CachedQueryMutator() as m:
        m.replace(get_active_events(), top_events, ttl=query_ttl)

    # ensure that all the amqp messages we've put on the worker's queue are
    # sent before we allow this script to exit.
    amqp.worker.join()

def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)

    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)

def activate_names_requested_in(link):
    tree = get_comment_tree(link)
    acceptable_names = []
    if tree.tree:
        top_level_cids = tree.tree[None]
        comments = chain.from_iterable(
            Comment._byID(chunk, return_dict=False, data=True)
            for chunk in in_chunks(top_level_cids))

        for comment in sorted(comments, key=lambda c: c._ups, reverse=True):
            if comment._spam or comment._deleted:
                continue
            sanitized = comment.body.strip()
            match = valid_name_re.search(sanitized)
            if match:
                acceptable_names.append((comment, match.group(1)))

    # we activate one name for each 100% of rev goal met
    names = acceptable_names[:link.revenue_bucket]
    activate_names(link, names)

    activated_names = [name for comment, name in names]
    link.server_names = activated_names
    link.flair_text = ", ".join(activated_names) if names else "/dev/null"
    link.flair_css_class = "goal-bucket-%d" % link.revenue_bucket
    link._commit()

def _generate_sitemaps(links, set_lastmod=True):
    """Create an iterator of sitemaps.

    Each sitemap has up to 50000 links, the maximum allowable number of
    links under the sitemap standard.
    """
    for link_chunk in in_chunks(links, LINKS_PER_SITEMAP):
        yield generate_sitemap_from_links(link_chunk, set_lastmod=set_lastmod)

def subreddit_sitemaps(subreddits):
    """Create an iterator of sitemaps.

    Each sitemap has up to 50000 links, the maximum allowable number of
    links under the sitemap standard.
    """
    for subreddit_chunk in in_chunks(subreddits, LINKS_PER_SITEMAP):
        yield _subreddit_sitemap(subreddit_chunk)

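# A toy check of the chunking arithmetic described in the docstrings above.
# SITEMAP_LIMIT and count_sitemaps are illustrative names; 50000 is the value
# LINKS_PER_SITEMAP is assumed to hold, per the sitemap standard's limit.
SITEMAP_LIMIT = 50000

def count_sitemaps(num_items):
    # ceiling division, mirroring how in_chunks splits the input
    return (num_items + SITEMAP_LIMIT - 1) // SITEMAP_LIMIT

assert count_sitemaps(50000) == 1    # exactly one full sitemap
assert count_sitemaps(120001) == 3   # two full sitemaps plus one single-entry sitemap
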
def shorten_byurl_keys():
    """We changed by_url keys from a format like
           byurl_google.com...
       to:
           byurl(1d5920f4b44b27a802bd77c4f0536f5a, google.com...)
       so that they would fit in memcache's 251-char limit
    """
    from datetime import datetime
    from hashlib import md5
    from r2.models import Link
    from r2.lib.filters import _force_utf8
    from pylons import g
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.db.operators import desc
    from r2.lib.utils import base_url, progress

    # from link.py
    def old_by_url_key(url):
        prefix = 'byurl_'
        s = _force_utf8(base_url(url.lower()))
        return '%s%s' % (prefix, s)

    def new_by_url_key(url):
        maxlen = 250
        template = 'byurl(%s,%s)'
        keyurl = _force_utf8(base_url(url.lower()))
        hexdigest = md5(keyurl).hexdigest()
        usable_len = maxlen - len(template) - len(hexdigest)
        return template % (hexdigest, keyurl[:usable_len])

    verbosity = 1000

    l_q = Link._query(
        Link.c._spam == (True, False),
        data=True,
        sort=desc('_date'))
    for links in in_chunks(
            progress(
                fetch_things2(l_q, verbosity),
                key=lambda link: link._date,
                verbosity=verbosity,
                estimate=int(9.9e6),
                persec=True,
            ),
            verbosity):
        # only links with actual URLs
        links = filter(
            lambda link: (not getattr(link, 'is_self', False)
                          and getattr(link, 'url', '')),
            links)

        # old key -> new key
        translate = dict((old_by_url_key(link.url),
                          new_by_url_key(link.url))
                         for link in links)

        old = g.permacache.get_multi(translate.keys())
        new = dict((translate[old_key], value)
                   for (old_key, value) in old.iteritems())
        g.permacache.set_multi(new)

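# A standalone sketch of the length-bounding trick new_by_url_key() relies on,
# runnable outside r2 (bounded_key and very_long are illustrative names, and
# the URL normalization done by base_url() is skipped here):
from hashlib import md5

def bounded_key(keyurl, maxlen=250, template='byurl(%s,%s)'):
    # the md5 hexdigest is always 32 chars, so truncating the readable tail to
    # whatever room is left keeps the whole key under memcache's limit
    hexdigest = md5(keyurl).hexdigest()
    usable_len = maxlen - len(template) - len(hexdigest)
    return template % (hexdigest, keyurl[:usable_len])

very_long = 'example.com/' + 'a' * 500
assert len(bounded_key(very_long)) <= 250
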
def update_activity():
    events = {}
    event_counts = collections.Counter()

    query = (ev for ev in LiveUpdateEvent._all()
             if ev.state == "live" and not ev.banned)
    for chunk in utils.in_chunks(query, size=100):
        context_ids = {"LiveUpdateEvent_" + ev._id: ev._id for ev in chunk}

        try:
            with c.activity_service.retrying(attempts=4) as svc:
                infos = svc.count_activity_multi(context_ids.keys())
        except TTransportException:
            continue

        for context_id, info in infos.iteritems():
            event_id = context_ids[context_id]

            try:
                LiveUpdateActivityHistoryByEvent.record_activity(
                    event_id, info.count)
            except tdb_cassandra.TRANSIENT_EXCEPTIONS as e:
                g.log.warning("Failed to update activity history for %r: %s",
                              event_id, e)

            try:
                event = LiveUpdateEvent.update_activity(
                    event_id, info.count, info.is_fuzzed)
            except tdb_cassandra.TRANSIENT_EXCEPTIONS as e:
                g.log.warning("Failed to update event activity for %r: %s",
                              event_id, e)
            else:
                events[event_id] = event
                event_counts[event_id] = info.count

            websockets.send_broadcast(
                "/live/" + event_id,
                type="activity",
                payload={
                    "count": info.count,
                    "fuzzed": info.is_fuzzed,
                },
            )

    top_event_ids = [
        event_id for event_id, count in event_counts.most_common(1000)]
    top_events = [events[event_id] for event_id in top_event_ids]
    query_ttl = datetime.timedelta(days=3)
    with CachedQueryMutator() as m:
        m.replace(get_active_events(), top_events, ttl=query_ttl)

    # ensure that all the amqp messages we've put on the worker's queue are
    # sent before we allow this script to exit.
    amqp.worker.join()

def get_details(cls, thing, voters=None):
    from r2.models import Comment, Link
    if isinstance(thing, Link):
        details_cls = VoteDetailsByLink
    elif isinstance(thing, Comment):
        details_cls = VoteDetailsByComment
    else:
        raise ValueError

    voter_id36s = None
    if voters:
        voter_id36s = [voter._id36 for voter in voters]

    try:
        row = details_cls._byID(thing._id36, properties=voter_id36s)
        raw_details = row._values()
    except tdb_cassandra.NotFound:
        return []

    try:
        row = VoterIPByThing._byID(thing._fullname, properties=voter_id36s)
        ips = row._values()
    except tdb_cassandra.NotFound:
        ips = {}

    # look up all the accounts in batches of 100
    account_id36s = set(raw_details.keys())
    accounts = {}
    for id_chunk in in_chunks(account_id36s, size=100):
        accounts.update(Account._byID36(id_chunk, data=True))

    details = []
    for voter_id36, json_data in raw_details.iteritems():
        vote_data = json.loads(json_data)
        vote_data = cls.convert_old_details(vote_data)

        extra_data = vote_data["data"]
        extra_data["ip"] = ips.get(voter_id36)

        vote = Vote(
            user=accounts[voter_id36],
            thing=thing,
            direction=Vote.deserialize_direction(vote_data["direction"]),
            date=datetime.utcfromtimestamp(vote_data["date"]),
            data=extra_data,
            effects=vote_data["effects"],
            get_previous_vote=False,
        )
        details.append(vote)

    details.sort(key=lambda d: d.date)

    return details

def bulk_load(self, start='', end='', chunk_size=100):
    """Try to load everything out of Cassandra and put it into memcached"""
    cf = self.cassa.cf
    for rows in in_chunks(cf.get_range(start=start, finish=end,
                                       columns=['value']),
                          chunk_size):
        print rows[0][0]
        rows = dict((key, pickle.loads(cols['value']))
                    for (key, cols) in rows
                    if (cols  # hack
                        and len(key) < 250))
        self.memcache.set_multi(rows)

def rebuild_index(after_id=None):
    cls = Link

    # don't pull spam/deleted
    q = cls._query(sort=desc('_date'), data=True)

    if after_id:
        q._after(cls._byID(after_id))

    q = fetch_things2(q)
    q = progress(q, verbosity=1000, estimate=10000000, persec=True)

    for chunk in in_chunks(q):
        inject(chunk)

def simple_get_multi(self, keys):
    results = {}
    category_bundles = {}
    for key in keys:
        category, ids = self._split_key(key)
        category_bundles.setdefault(category, []).append(ids)

    for category in category_bundles:
        idses = category_bundles[category]
        chunks = in_chunks(idses, size=50)
        for chunk in chunks:
            new_results = self.backend.get_multi(category, chunk)
            results.update(new_results)

    return results

def give_trophies(users):
    for fullnames in in_chunks(progress(users, verbosity=50), size=50):
        users = Account._by_fullname(fullnames, return_dict=False)

        for user in users:
            team = get_user_team(user)

            trophy = Award.give_if_needed(
                codename="f2p_orangered" if team == "red" else "f2p_periwinkle",
                user=user,
            )
            if trophy:
                trophy._commit()

        sleep(.5)

def get_recent_name_submissions():
    link_fullnames = list(queries.get_links(SERVERNAME_SR, "new", "all"))
    links = chain.from_iterable(
        Thing._by_fullname(chunk, return_dict=False)
        for chunk in in_chunks(link_fullnames))

    for link in links:
        if link._deleted or link._spam:
            continue

        # OH GOD WHAT HAVE YOU POSTED IN MY LOVELY AUTOMATED SUBREDDIT!?
        if (not hasattr(link, "revenue_date") or
                not hasattr(link, "revenue_bucket") or
                not hasattr(link, "server_names")):
            continue

        yield link

def port_deleted_links(after_id=None):
    from r2.models import Link
    from r2.lib.db.operators import desc
    from r2.models.query_cache import CachedQueryMutator
    from r2.lib.db.queries import get_deleted_links
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc("_date"), data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])

def rebuild_link_index(start_at=None, sleeptime=1, cls=Link,
                       uploader=LinkUploader, doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000, chunk_size=1000):
    cache_key = _REBUILD_INDEX_CACHE_KEY % uploader.__name__.lower()
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(cache_key)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             cache_key)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc('_date'), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        #uploader.things = chunk
        uploader.fullnames = [link._fullname for link in chunk]
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(cache_key, last_update._fullname)
        time.sleep(sleeptime)

def rebuild_index(after_id=None, estimate=10000000):
    cls = Link

    # don't pull spam/deleted
    q = cls._query(sort=desc('_date'), data=True)

    if after_id:
        q._after(cls._byID(after_id))

    q = fetch_things2(q)

    def key(link):
        # we're going back in time, so this will give us a good idea
        # of how far we've gone
        return "%s/%s" % (link._id, link._date)

    q = progress(q, verbosity=1000, estimate=estimate, persec=True, key=key)

    for chunk in in_chunks(q):
        inject(chunk)

def backfill_vote_details(cls):
    ninety_days = timedelta(days=90).total_seconds()
    for chunk in in_chunks(cls._all(), size=100):
        detail_chunk = defaultdict(dict)
        try:
            with VoterIPByThing._cf.batch(
                    write_consistency_level=cls._write_consistency_level) as b:
                for vote_list in chunk:
                    thing_id36 = vote_list._id
                    thing_fullname = vote_list.votee_fullname
                    details = vote_list.decode_details()
                    for detail in details:
                        voter_id36 = detail["voter_id"]
                        if "ip" in detail and detail["ip"]:
                            ip = detail["ip"]
                            redacted = dict(detail)
                            del redacted["ip"]
                            cast = detail["date"]
                            now = epoch_seconds(
                                datetime.utcnow().replace(tzinfo=g.tz))
                            ttl = ninety_days - (now - cast)
                            oneweek = ""
                            if ttl < 3600 * 24 * 7:
                                oneweek = "(<= one week left)"
                            print "Inserting %s with IP ttl %d %s" % (
                                redacted, ttl, oneweek)
                            detail_chunk[thing_id36][voter_id36] = json.dumps(
                                redacted)
                            if ttl <= 0:
                                print "Skipping bogus ttl for %s: %d" % (
                                    redacted, ttl)
                                continue
                            b.insert(thing_fullname, {voter_id36: ip}, ttl=ttl)
        except Exception:
            # Getting some really weird spurious errors here; complaints about
            # negative TTLs even though they can't possibly be negative, errors
            # from cass that have an explanation of "(why=')"
            # Just going to brute-force this through. We might lose 100 here
            # and there but mostly it'll be intact.
            pass
        for votee_id36, valuedict in detail_chunk.iteritems():
            cls._set_values(votee_id36, valuedict)

def _populate(after_id=None, estimate=54301242):
    from r2.models import desc
    from r2.lib.db import tdb_cassandra
    from r2.lib import utils

    # larger has a chance to decrease the number of Cassandra writes,
    # but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc("_date"))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, "link_id"), chunk)
        update_comment_votes(chunk)

def rebuild_link_index(
        start_at=None,
        sleeptime=1,
        cls=Link,
        uploader=LinkUploader,
        doc_api="CLOUDSEARCH_DOC_API",
        estimate=50000000,
        chunk_size=1000,
):
    cache_key = _REBUILD_INDEX_CACHE_KEY % uploader.__name__.lower()
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    if start_at is _REBUILD_INDEX_CACHE_KEY:
        start_at = g.cache.get(cache_key)
        if not start_at:
            raise ValueError("Told me to use '%s' key, but it's not set" %
                             cache_key)

    q = cls._query(cls.c._deleted == (True, False),
                   sort=desc("_date"), data=True)
    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)
    q = r2utils.fetch_things2(q, chunk_size=chunk_size)
    q = r2utils.progress(q, verbosity=1000, estimate=estimate, persec=True,
                         key=_progress_key)
    for chunk in r2utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        g.cache.set(cache_key, last_update._fullname)
        time.sleep(sleeptime)

def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrl
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc("_date"), data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q
         if getattr(l, "url", "self") != "self" and
         not getattr(l, "is_self", False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        with LinksByUrl._cf.batch(write_consistency_level=CL.ONE) as b:
            for l in chunk:
                k = LinksByUrl._key_from_url(l.url)
                if k:
                    b.insert(k, {l._id36: l._id36})

def port_cassaurls(after_id=None, estimate=15231317):
    from r2.models import Link, LinksByUrlAndSubreddit
    from r2.lib.db import tdb_cassandra
    from r2.lib.db.operators import desc
    from r2.lib.db.tdb_cassandra import CL
    from r2.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'), data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q
         if getattr(l, 'url', 'self') != 'self' and
         not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubreddit.add_link(l)

def _location_by_ips(ips):
    if not hasattr(g, 'geoip_location'):
        g.log.warning("g.geoip_location not set. skipping GeoIP lookup.")
        return {}

    ret = {}
    for batch in in_chunks(ips, MAX_IPS_PER_GROUP):
        ip_string = '+'.join(batch)
        url = os.path.join(g.geoip_location, 'geoip', ip_string)

        try:
            response = urllib2.urlopen(url=url, timeout=3)
            json_data = response.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as e:
            g.log.warning("Failed to fetch GeoIP information: %r" % e)
            continue

        try:
            ret.update(json.loads(json_data))
        except ValueError as e:
            g.log.warning("Invalid JSON response for GeoIP lookup: %r" % e)
            continue

    return ret

def pushup_permacache(verbosity=1000):
    """When putting cassandra into the permacache chain, we need to push
       everything up into the rest of the chain, so this is everything that
       uses the permacache, as of that check-in."""
    from pylons import g
    from r2.models import Link, Subreddit, Account
    from r2.lib.db.operators import desc
    from r2.lib.comment_tree import comments_key, messages_key
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.utils import last_modified_key
    from r2.lib.promote import promoted_memo_key
    from r2.lib.subreddit_search import load_all_reddits
    from r2.lib.db import queries
    from r2.lib.cache import CassandraCacheChain

    authority = g.permacache.caches[-1]
    nonauthority = CassandraCacheChain(g.permacache.caches[1:-1])

    def populate(keys):
        vals = authority.simple_get_multi(keys)
        if vals:
            nonauthority.set_multi(vals)

    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_reddits()

        yield queries.get_all_comments().iden

        l_q = Link._query(Link.c._spam == (True, False),
                          Link.c._deleted == (True, False),
                          sort=desc('_date'),
                          data=True,
                          )
        for link in fetch_things2(l_q, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        a_q = Account._query(Account.c._spam == (True, False),
                             sort=desc('_date'),
                             )
        for account in fetch_things2(a_q, verbosity):
            yield messages_key(account._id)
            yield last_modified_key(account, 'overview')
            yield last_modified_key(account, 'commented')
            yield last_modified_key(account, 'submitted')
            yield last_modified_key(account, 'liked')
            yield last_modified_key(account, 'disliked')
            yield queries.get_comments(account, 'new', 'all').iden
            yield queries.get_submitted(account, 'new', 'all').iden
            yield queries.get_liked(account).iden
            yield queries.get_disliked(account).iden
            yield queries.get_hidden(account).iden
            yield queries.get_saved(account).iden
            yield queries.get_inbox_messages(account).iden
            yield queries.get_unread_messages(account).iden
            yield queries.get_inbox_comments(account).iden
            yield queries.get_unread_comments(account).iden
            yield queries.get_inbox_selfreply(account).iden
            yield queries.get_unread_selfreply(account).iden
            yield queries.get_sent(account).iden

        sr_q = Subreddit._query(Subreddit.c._spam == (True, False),
                                sort=desc('_date'),
                                )
        for sr in fetch_things2(sr_q, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')
            yield queries.get_links(sr, 'hot', 'all').iden
            yield queries.get_links(sr, 'new', 'all').iden

            for sort in 'top', 'controversial':
                for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                    yield queries.get_links(sr, sort, time,
                                            merge_batched=False).iden
            yield queries.get_spam_links(sr).iden
            yield queries.get_spam_comments(sr).iden
            yield queries.get_reported_links(sr).iden
            yield queries.get_reported_comments(sr).iden
            yield queries.get_subreddit_messages(sr).iden
            yield queries.get_unread_subreddit_messages(sr).iden

    done = 0
    for keys in in_chunks(gen_keys(), verbosity):
        g.reset_caches()
        done += len(keys)
        print 'Done %d: %r' % (done, keys[-1])
        populate(keys)

def pushup_permacache(verbosity=1000):
    """When putting cassandra into the permacache chain, we need to push
       everything up into the rest of the chain, so this is everything that
       uses the permacache, as of that check-in."""
    from pylons import app_globals as g
    from r2.models import Link, Subreddit, Account
    from r2.lib.db.operators import desc
    from r2.lib.comment_tree import comments_key, messages_key
    from r2.lib.utils import fetch_things2, in_chunks
    from r2.lib.utils import last_modified_key
    from r2.lib.promote import promoted_memo_key
    from r2.lib.subreddit_search import load_all_reddits
    from r2.lib.db import queries
    from r2.lib.cache import CassandraCacheChain

    authority = g.permacache.caches[-1]
    nonauthority = CassandraCacheChain(g.permacache.caches[1:-1])

    def populate(keys):
        vals = authority.simple_get_multi(keys)
        if vals:
            nonauthority.set_multi(vals)

    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_reddits()

        yield queries.get_all_comments().iden

        l_q = Link._query(
            Link.c._spam == (True, False),
            Link.c._deleted == (True, False),
            sort=desc('_date'),
            data=True,
        )
        for link in fetch_things2(l_q, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        a_q = Account._query(
            Account.c._spam == (True, False),
            sort=desc('_date'),
        )
        for account in fetch_things2(a_q, verbosity):
            yield messages_key(account._id)
            yield last_modified_key(account, 'overview')
            yield last_modified_key(account, 'commented')
            yield last_modified_key(account, 'submitted')
            yield last_modified_key(account, 'liked')
            yield last_modified_key(account, 'disliked')
            yield queries.get_comments(account, 'new', 'all').iden
            yield queries.get_submitted(account, 'new', 'all').iden
            yield queries.get_liked(account).iden
            yield queries.get_disliked(account).iden
            yield queries.get_hidden(account).iden
            yield queries.get_saved(account).iden
            yield queries.get_inbox_messages(account).iden
            yield queries.get_unread_messages(account).iden
            yield queries.get_inbox_comments(account).iden
            yield queries.get_unread_comments(account).iden
            yield queries.get_inbox_selfreply(account).iden
            yield queries.get_unread_selfreply(account).iden
            yield queries.get_sent(account).iden

        sr_q = Subreddit._query(
            Subreddit.c._spam == (True, False),
            sort=desc('_date'),
        )
        for sr in fetch_things2(sr_q, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')
            yield queries.get_links(sr, 'hot', 'all').iden
            yield queries.get_links(sr, 'new', 'all').iden

            for sort in 'top', 'controversial':
                for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                    yield queries.get_links(sr, sort, time,
                                            merge_batched=False).iden
            yield queries.get_spam_links(sr).iden
            yield queries.get_spam_comments(sr).iden
            yield queries.get_reported_links(sr).iden
            yield queries.get_reported_comments(sr).iden
            yield queries.get_subreddit_messages(sr).iden
            yield queries.get_unread_subreddit_messages(sr).iden

    done = 0
    for keys in in_chunks(gen_keys(), verbosity):
        g.reset_caches()
        done += len(keys)
        print 'Done %d: %r' % (done, keys[-1])
        populate(keys)