def process_votes(drain=False, limit=100):
    def _handle_votes(msgs, chan):
        to_do = []
        uids = set()
        tids = set()
        for x in msgs:
            r = pickle.loads(x.body)
            uid, tid, dir, ip, organic, cheater = r
            print(uid, tid, dir, ip, organic, cheater)

            uids.add(uid)
            tids.add(tid)
            to_do.append((uid, tid, dir, ip, organic, cheater))

        users = Account._byID(uids, data=True, return_dict=True)
        things = Thing._by_fullname(tids, data=True, return_dict=True)

        for uid, tid, dir, ip, organic, cheater in to_do:
            handle_vote(users[uid], things[tid], dir, ip, organic,
                        cheater=cheater)

    amqp.handle_items('register_vote_q', _handle_votes, limit=limit,
                      drain=drain)

def run_changed(drain=False):
    """
        Run by `cron` (through `paster run`) on a schedule to update
        all Things that have been created or have changed since the
        last run. Note: unlike many queue-using functions, this one is
        run from cron and totally drains the queue before terminating
    """
    @g.stats.amqp_processor('solrsearch_changes')
    def _run_changed(msgs, chan):
        print "changed: Processing %d items" % len(msgs)

        msgs = [strordict_fullname(msg.body) for msg in msgs]
        fullnames = set(msg['fullname'] for msg in msgs
                        if not msg.get('boost_only'))

        things = Thing._by_fullname(fullnames, data=True, return_dict=False)
        things = [x for x in things if isinstance(x, indexed_types)]

        update_things = [x for x in things if not x._spam and not x._deleted]
        delete_things = [x for x in things if x._spam or x._deleted]

        with SolrConnection() as s:
            if update_things:
                tokenized = tokenize_things(update_things)
                s.add(tokenized)
            if delete_things:
                for i in delete_things:
                    s.delete(id=i._fullname)

    amqp.handle_items('solrsearch_changes', _run_changed, limit=1000,
                      drain=drain)

def consume_subreddit_query_queue(qname="subreddit_query_q", limit=1000):
    @g.stats.amqp_processor(qname)
    def process_message(msgs, chan):
        """Update get_links(), the Links by Subreddit precomputed query.

        get_links() is a CachedResult which is stored in permacache. To
        update these objects we need to do a read-modify-write which
        requires obtaining a lock. Sharding these updates by subreddit
        allows us to run multiple consumers (but ideally just one per
        shard) to avoid lock contention.

        """
        from r2.lib.db.queries import add_queries, get_links

        link_names = {msg.body for msg in msgs}
        links = Link._by_fullname(link_names, return_dict=False)
        print 'Processing %r' % (links,)

        links_by_sr_id = defaultdict(list)
        for link in links:
            links_by_sr_id[link.sr_id].append(link)

        srs_by_id = Subreddit._byID(links_by_sr_id.keys(), stale=True)

        for sr_id, links in links_by_sr_id.iteritems():
            with g.stats.get_timer("link_vote_processor.subreddit_queries"):
                sr = srs_by_id[sr_id]
                add_queries(
                    queries=[get_links(sr, sort, "all") for sort in SORTS],
                    insert_items=links,
                )

    amqp.handle_items(qname, process_message, limit=limit)

def run_changed(drain=False, limit=100, sleep_time=10, verbose=True):
    """reddit-consumer-update_promos: amqp consumer of update_promos_q

    Handles asynch accepting/rejecting of ads that are scheduled to be live
    right now

    """
    @g.stats.amqp_processor(UPDATE_QUEUE)
    def _run(msgs, chan):
        items = [json.loads(msg.body) for msg in msgs]

        if QUEUE_ALL in items:
            # QUEUE_ALL is just an indicator to run make_daily_promotions.
            # There's no promotion log to update in this case.
            print "Received %s QUEUE_ALL message(s)" % items.count(QUEUE_ALL)
            items = [i for i in items if i != QUEUE_ALL]
            make_daily_promotions()

        links = Link._by_fullname([i["link"] for i in items])

        for item in items:
            PromotionLog.add(
                links[item['link']],
                "Finished remaking current promotions (this link "
                "was: %(message)s" % item)

    amqp.handle_items(UPDATE_QUEUE, _run, limit=limit, drain=drain,
                      sleep_time=sleep_time, verbose=verbose)

def run_changed(drain=False, limit=100, sleep_time=10, verbose=False):
    """reddit-consumer-update_promos: amqp consumer of update_promos_q

    Handles asynch accepting/rejecting of ads that are scheduled to be live
    right now

    """
    @g.stats.amqp_processor(UPDATE_QUEUE)
    def _run(msgs, chan):
        items = [json.loads(msg.body) for msg in msgs]

        if QUEUE_ALL in items:
            # QUEUE_ALL is just an indicator to run make_daily_promotions.
            # There's no promotion log to update in this case.
            items.remove(QUEUE_ALL)
            make_daily_promotions()

        links = Link._by_fullname([i["link"] for i in items])

        for item in items:
            PromotionLog.add(
                # look the link up by the fullname carried in the message
                links[item["link"]],
                "Finished remaking current promotions (this link "
                "was: %(message)s" % item,
                commit=True,
            )

    amqp.handle_items(UPDATE_QUEUE, _run, limit=limit, drain=drain,
                      sleep_time=sleep_time, verbose=verbose)

def run_changed(drain=False):
    """
        Run by `cron` (through `paster run`) on a schedule to update
        all Things that have been created or have changed since the
        last run. Note: unlike many queue-using functions, this one is
        run from cron and totally drains the queue before terminating
    """
    def _run_changed(msgs, chan):
        print "changed: Processing %d items" % len(msgs)

        msgs = [strordict_fullname(msg.body) for msg in msgs]
        fullnames = set(msg['fullname'] for msg in msgs)

        things = Thing._by_fullname(fullnames, data=True, return_dict=False)
        things = [x for x in things if isinstance(x, indexed_types)]

        update_things = [x for x in things if not x._spam and not x._deleted]
        delete_things = [x for x in things if x._spam or x._deleted]

        with SolrConnection() as s:
            if update_things:
                tokenized = tokenize_things(update_things)
                s.add(tokenized)
            if delete_things:
                for i in delete_things:
                    s.delete(id=i._fullname)

    amqp.handle_items('solrsearch_changes', _run_changed, limit=1000,
                      drain=drain)

def consume_subreddit_query_queue(qname="subreddit_query_q", limit=1000):
    @g.stats.amqp_processor(qname)
    def process_message(msgs, chan):
        """Update get_links(), the Links by Subreddit precomputed query.

        get_links() is a CachedResult which is stored in permacache. To
        update these objects we need to do a read-modify-write which
        requires obtaining a lock. Sharding these updates by subreddit
        allows us to run multiple consumers (but ideally just one per
        shard) to avoid lock contention.

        """
        from r2.lib.db.queries import add_queries, get_links

        link_names = {msg.body for msg in msgs}
        links = Link._by_fullname(link_names, return_dict=False)
        print 'Processing %r' % (links,)

        links_by_sr_id = defaultdict(list)
        for link in links:
            links_by_sr_id[link.sr_id].append(link)

        srs_by_id = Subreddit._byID(links_by_sr_id.keys(), stale=True)

        for sr_id, links in links_by_sr_id.iteritems():
            with g.stats.get_timer("link_vote_processor.subreddit_queries"):
                sr = srs_by_id[sr_id]
                add_queries(
                    queries=[get_links(sr, sort, "all") for sort in SORTS],
                    insert_items=links,
                )

    amqp.handle_items(qname, process_message, limit=limit)

def process_comment_sorts(limit=500):
    def _handle_sort(msgs, chan):
        cids = list(set(int(msg.body) for msg in msgs))
        comments = Comment._byID(cids, data=True, return_dict=False)
        print comments
        update_comment_votes(comments)

    amqp.handle_items('commentsort_q', _handle_sort, limit=limit)

def process_comment_sorts(limit=500):
    def _handle_sort(msgs, chan):
        cids = list(set(int(msg.body) for msg in msgs))
        comments = Comment._byID(cids, data=True, return_dict=False)
        print comments
        update_comment_votes(comments)

    amqp.handle_items('commentsort_q', _handle_sort, limit=limit)

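# The commentsort consumer above reads comment ids (as strings) from
# 'commentsort_q'. A minimal producer-side sketch, assuming only that
# r2.lib.amqp exposes add_item(routing_key, body) as the other consumers in
# this file use it; the helper name below is illustrative, not the codebase's
# actual producer:
def queue_comment_for_resort(comment):
    from r2.lib import amqp

    # _handle_sort() above does int(msg.body), so the body is the comment id
    amqp.add_item('commentsort_q', str(comment._id))
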
def run_changed(drain=False, limit=1000):
    """
        Run by `cron` (through `paster run`) on a schedule to send Things to
        IndexTank
    """
    def _run_changed(msgs, chan):
        start = datetime.now(g.tz)

        changed = map(lambda x: strordict_fullname(x.body), msgs)

        boost = set()
        add = set()

        # an item can request that only its boost fields be updated,
        # so we need to separate those out
        for item in changed:
            fname = item['fullname']
            boost_only = item.get('boost_only', False)

            if fname in add:
                # we're already going to do all of the work
                continue

            if boost_only:
                boost.add(fname)
            else:
                if fname in boost:
                    # we've previously seen an instance of this fname
                    # that requested that only its boosts be updated,
                    # but now we have to update the whole thing
                    boost.remove(fname)
                add.add(fname)

        things = Thing._by_fullname(boost | add, data=True, return_dict=True)

        boost_time = add_time = 0.0
        if boost:
            boost_time = inject([things[fname] for fname in boost],
                                boost_only=True)
        if add:
            add_time = inject([things[fname] for fname in add])

        totaltime = epoch_seconds(datetime.now(g.tz)) - epoch_seconds(start)

        print ("%s: %d messages: %d docs (%.2fs), %d boosts (%.2fs) "
               "in %.2fs (%d duplicates, %s remaining)"
               % (start,
                  len(changed),
                  len(add), add_time,
                  len(boost), boost_time,
                  totaltime,
                  len(changed) - len(things),
                  msgs[-1].delivery_info.get('message_count', 'unknown'),
                  ))

    amqp.handle_items('indextank_changes', _run_changed, limit=limit,
                      drain=drain, verbose=False)

def run_changed(drain=False, limit=1000):
    """
        Run by `cron` (through `paster run`) on a schedule to send Things to
        IndexTank
    """
    def _run_changed(msgs, chan):
        start = datetime.now(g.tz)

        changed = map(lambda x: strordict_fullname(x.body), msgs)

        boost = set()
        add = set()

        # an item can request that only its boost fields be updated,
        # so we need to separate those out
        for item in changed:
            fname = item['fullname']
            boost_only = item.get('boost_only', False)

            if fname in add:
                # we're already going to do all of the work
                continue

            if boost_only:
                boost.add(fname)
            else:
                if fname in boost:
                    # we've previously seen an instance of this fname
                    # that requested that only its boosts be updated,
                    # but now we have to update the whole thing
                    boost.remove(fname)
                add.add(fname)

        things = Thing._by_fullname(boost | add, data=True, return_dict=True)

        boost_time = add_time = 0.0
        if boost:
            boost_time = inject([things[fname] for fname in boost],
                                boost_only=True)
        if add:
            add_time = inject([things[fname] for fname in add])

        totaltime = epoch_seconds(datetime.now(g.tz)) - epoch_seconds(start)

        print ("%s: %d messages: %d docs (%.2fs), %d boosts (%.2fs) "
               "in %.2fs (%d duplicates, %s remaining)"
               % (start,
                  len(changed),
                  len(add), add_time,
                  len(boost), boost_time,
                  totaltime,
                  len(changed) - len(things),
                  msgs[-1].delivery_info.get('message_count', 'unknown'),
                  ))

    amqp.handle_items('indextank_changes', _run_changed, limit=limit,
                      drain=drain, verbose=False)

def run_changed(self, drain=False,
                min_size=int(getattr(g, 'SOLR_MIN_BATCH', 500)), limit=1000,
                sleep_time=10, use_safe_get=False, verbose=False):
    '''Run by `cron` (through `paster run`) on a schedule to send Things to
    Cloud

    '''
    if use_safe_get:
        CloudSearchUploader.use_safe_get = True
    amqp.handle_items('cloudsearch_changes', _run_changed, min_size=min_size,
                      limit=limit, drain=drain, sleep_time=sleep_time,
                      verbose=verbose)

def process_events(g, timeout=5.0, **kw):
    publisher = EventPublisher(
        g.events_collector_url,
        g.secrets["events_collector_key"],
        g.secrets["events_collector_secret"],
        g.useragent,
        g.stats,
        timeout=timeout,
    )
    test_publisher = EventPublisher(
        g.events_collector_test_url,
        g.secrets["events_collector_key"],
        g.secrets["events_collector_secret"],
        g.useragent,
        g.stats,
        timeout=timeout,
    )

    @g.stats.amqp_processor("event_collector")
    def processor(msgs, chan):
        events = []
        test_events = []

        for msg in msgs:
            headers = msg.properties.get("application_headers", {})
            truncatable_field = headers.get("truncatable_field")
            event = PublishableEvent(msg.body, truncatable_field)
            if msg.delivery_info["routing_key"] == "event_collector_test":
                test_events.append(event)
            else:
                events.append(event)

        to_publish = itertools.chain(
            publisher.publish(events),
            test_publisher.publish(test_events),
        )
        for response, sent in to_publish:
            if response.ok:
                g.log.info("Published %s events", len(sent))
            else:
                g.log.warning(
                    "Event send failed %s - %s",
                    response.status_code,
                    _get_reason(response),
                )
                g.log.warning("Response headers: %r", response.headers)

                # if the events were too large, move them into a separate
                # queue to get them out of here, since they'll always fail
                if response.status_code == 413:
                    for event in sent:
                        amqp.add_item("event_collector_failed", event)
                else:
                    response.raise_for_status()

    amqp.handle_items("event_collector", processor, **kw)

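# The consumer above drains the "event_collector" queue; the producer side is
# not shown in this file. A minimal sketch of how an event might be enqueued,
# assuming only r2.lib.amqp.add_item(routing_key, body) as used by the
# consumers here. The queue_event name and the JSON payload shape are
# illustrative, not the codebase's actual producer, and the optional
# "truncatable_field" header read by the consumer is omitted for brevity:
def queue_event(payload, test=False):
    import json
    from r2.lib import amqp

    # the consumer splits test traffic by routing key
    routing_key = "event_collector_test" if test else "event_collector"
    amqp.add_item(routing_key, json.dumps(payload))
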
def run_changed(self, drain=False,
                min_size=int(getattr(g, 'solr_min_batch', 500)), limit=1000,
                sleep_time=10, use_safe_get=False, verbose=False):
    '''Run by `cron` (through `paster run`) on a schedule to send Things to
    Solr

    '''
    if use_safe_get:
        SolrSearchUploader.use_safe_get = True
    amqp.handle_items('cloudsearch_changes', _run_changed, min_size=min_size,
                      limit=limit, drain=drain, sleep_time=sleep_time,
                      verbose=verbose)

def run(limit=1000, verbose=False):
    def myfunc(msgs, chan):
        if verbose:
            print "processing a batch"

        incrs = {}

        for msg in msgs:
            try:
                d = check_dict(msg.body)
            except TypeError:
                log_text("usage_q error", "wtf is %r" % msg.body, "error")
                continue

            hund_sec = hund_from_start_and_end(d["start_time"], d["end_time"])

            action = d["action"].replace("-", "_")

            fudged_count = int(1 / d["sampling_rate"])
            fudged_elapsed = int(hund_sec / d["sampling_rate"])

            for exp_time, bucket in buckets(d["end_time"]):
                k = "%s-%s" % (bucket, action)
                incrs.setdefault(k, [0, 0, exp_time])
                incrs[k][0] += fudged_count
                incrs[k][1] += fudged_elapsed

        for k, (count, elapsed, exp_time) in incrs.iteritems():
            c_key = "profile_count-" + k
            e_key = "profile_elapsed-" + k

            if verbose:
                c_old = g.hardcache.get(c_key)
                e_old = g.hardcache.get(e_key)

            g.hardcache.accrue(c_key, delta=count, time=exp_time)
            g.hardcache.accrue(e_key, delta=elapsed, time=exp_time)

            if verbose:
                c_new = g.hardcache.get(c_key)
                e_new = g.hardcache.get(e_key)

                print "%s: %s -> %s" % (c_key, c_old, c_new)
                print "%s: %s -> %s" % (e_key, e_old, e_new)

        if len(msgs) < limit / 2:
            if verbose:
                print "Sleeping..."
            sleep(10)

    amqp.handle_items(q, myfunc, limit=limit, drain=False, verbose=verbose,
                      sleep_time=30)

def run_changed(drain=False, min_size=500, limit=1000, sleep_time=10,
                use_safe_get=False, verbose=False):
    '''Run by `cron` (through `paster run`) on a schedule to send Things to
    Amazon CloudSearch

    '''
    if use_safe_get:
        CloudSearchUploader.use_safe_get = True
    amqp.handle_items('cloudsearch_changes', _run_changed, min_size=min_size,
                      limit=limit, drain=drain, sleep_time=sleep_time,
                      verbose=verbose)

def run_commentstree(limit=100):
    """Add new incoming comments to their respective comments trees"""
    def _run_commentstree(msgs, chan):
        comments = Comment._by_fullname([msg.body for msg in msgs],
                                        data=True, return_dict=False)
        print 'Processing %r' % (comments,)

        add_comment_tree(comments)

    amqp.handle_items('commentstree_q', _run_commentstree, limit=limit)

def run_commentstree(limit=100):
    """Add new incoming comments to their respective comments trees"""
    @g.stats.amqp_processor("commentstree_q")
    def _run_commentstree(msgs, chan):
        comments = Comment._by_fullname([msg.body for msg in msgs],
                                        data=True, return_dict=False)
        print "Processing %r" % (comments,)

        add_comment_tree(comments)

    amqp.handle_items("commentstree_q", _run_commentstree, limit=limit)

def run_new_comments():
    """Add new incoming comments to the /comments page"""
    # this is done as a queue because otherwise the contention for the
    # lock on the query would be very high
    def _run_new_comments(msgs, chan):
        fnames = [msg.body for msg in msgs]
        comments = Comment._by_fullname(fnames, data=True, return_dict=False)
        add_queries([get_all_comments()], insert_items=comments)

    amqp.handle_items("newcomments_q", _run_new_comments, limit=100)

def run_changed(drain=False):
    """
        Run by `cron` (through `paster run`) on a schedule to send Things to
        IndexTank
    """
    def _run_changed(msgs, chan):
        fullnames = set([x.body for x in msgs])
        things = Thing._by_fullname(fullnames, data=True, return_dict=False)
        inject(things)

    amqp.handle_items('indextank_changes', _run_changed, limit=1000,
                      drain=drain)

def run_new_comments():
    """Add new incoming comments to the /comments page"""
    # this is done as a queue because otherwise the contention for the
    # lock on the query would be very high
    def _run_new_comments(msgs, chan):
        fnames = [msg.body for msg in msgs]
        comments = Comment._by_fullname(fnames, data=True, return_dict=False)
        add_queries([get_all_comments()], insert_items=comments)

    amqp.handle_items('newcomments_q', _run_new_comments, limit=100)

def run(limit=1000, verbose=False):
    def myfunc(msgs, chan):
        if verbose:
            print "processing a batch"

        incrs = {}

        for msg in msgs:
            try:
                d = check_dict(msg.body)
            except TypeError:
                log_text("usage_q error", "wtf is %r" % msg.body, "error")
                continue

            hund_sec = hund_from_start_and_end(d["start_time"], d["end_time"])

            action = d["action"].replace("-", "_")

            fudged_count = int(1 / d["sampling_rate"])
            fudged_elapsed = int(hund_sec / d["sampling_rate"])

            for exp_time, bucket in buckets(d["end_time"]):
                k = "%s-%s" % (bucket, action)
                incrs.setdefault(k, [0, 0, exp_time])
                incrs[k][0] += fudged_count
                incrs[k][1] += fudged_elapsed

        for k, (count, elapsed, exp_time) in incrs.iteritems():
            c_key = "profile_count-" + k
            e_key = "profile_elapsed-" + k

            if verbose:
                c_old = g.hardcache.get(c_key)
                e_old = g.hardcache.get(e_key)

            g.hardcache.accrue(c_key, delta=count, time=exp_time)
            g.hardcache.accrue(e_key, delta=elapsed, time=exp_time)

            if verbose:
                c_new = g.hardcache.get(c_key)
                e_new = g.hardcache.get(e_key)

                print "%s: %s -> %s" % (c_key, c_old, c_new)
                print "%s: %s -> %s" % (e_key, e_old, e_new)

        if len(msgs) < limit / 2:
            if verbose:
                print "Sleeping..."
            sleep(10)

    amqp.handle_items(q, myfunc, limit=limit, drain=False, verbose=verbose,
                      sleep_time=30)

def run_new_comments(limit=1000):
    """Add new incoming comments to the /comments page"""
    # this is done as a queue because otherwise the contention for the
    # lock on the query would be very high
    @g.stats.amqp_processor('newcomments_q')
    def _run_new_comments(msgs, chan):
        fnames = [msg.body for msg in msgs]

        comments = Comment._by_fullname(fnames, data=True, return_dict=False)
        add_queries([get_all_comments()], insert_items=comments)

        bysrid = _by_srid(comments, False)
        for srid, sr_comments in bysrid.iteritems():
            add_queries([_get_sr_comments(srid)], insert_items=sr_comments)

    amqp.handle_items('newcomments_q', _run_new_comments, limit=limit)

def run_new_comments(limit=1000):
    """Add new incoming comments to the /comments page"""
    # this is done as a queue because otherwise the contention for the
    # lock on the query would be very high
    @g.stats.amqp_processor("newcomments_q")
    def _run_new_comments(msgs, chan):
        fnames = [msg.body for msg in msgs]

        comments = Comment._by_fullname(fnames, data=True, return_dict=False)
        add_queries([get_all_comments()], insert_items=comments)

        bysrid = _by_srid(comments, False)
        for srid, sr_comments in bysrid.iteritems():
            add_queries([_get_sr_comments(srid)], insert_items=sr_comments)

    amqp.handle_items("newcomments_q", _run_new_comments, limit=limit)

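# The producer side for the comment consumers above is not included here. A
# minimal sketch of what enqueueing might look like when a comment is created,
# assuming r2.lib.amqp.add_item(routing_key, body) and that both consumers
# expect the comment's fullname as the message body (queue_new_comment is an
# illustrative name, not necessarily the codebase's own helper):
def queue_new_comment(comment):
    from r2.lib import amqp

    # the /comments listing consumer and the comments-tree consumer both
    # resolve the fullname again with Comment._by_fullname()
    amqp.add_item("newcomments_q", comment._fullname)
    amqp.add_item("commentstree_q", comment._fullname)
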
def run():
    def callback(msgs, chan):
        for msg in msgs:  # will be len==1
            # cr is a r2.lib.db.queries.CachedResults
            cr = pickle.loads(msg.body)
            iden = cr.query._iden()

            working_key = working_prefix + iden
            key = prefix + iden

            last_time = g.memcache.get(key)
            # check to see if we've computed this job since it was
            # added to the queue
            if last_time and last_time > msg.timestamp:
                print 'skipping, already computed ', key
                return

            if not cr.preflight_check():
                print 'skipping, preflight check failed', key
                return
            # check if someone else is working on this
            elif not g.memcache.add(working_key, 1, TIMEOUT):
                print 'skipping, someone else is working', working_key
                return

            print 'working: ', iden, cr.query._rules, cr.query._sort
            start = datetime.now()
            try:
                cr.update()
                g.memcache.set(key, datetime.now())

                cr.postflight()
            finally:
                g.memcache.delete(working_key)

            done = datetime.now()
            q_time_s = (done - msg.timestamp).seconds
            proc_time_s = (done - start).seconds + (
                (done - start).microseconds / 1000000.0)
            print ('processed %s in %.6f seconds after %d seconds in queue'
                   % (iden, proc_time_s, q_time_s))

    amqp.handle_items('prec_links', callback, limit=1)

def process_votes(limit=None):
    # limit is taken but ignored for backwards compatibility
    def _handle_vote(msgs, chan):
        assert(len(msgs) == 1)
        msg = msgs[0]

        r = pickle.loads(msg.body)
        uid, tid, dir, ip, organic, cheater = r
        voter = Account._byID(uid, data=True)
        votee = Thing._by_fullname(tid, data=True)
        print (voter, votee, dir, ip, organic, cheater)
        handle_vote(voter, votee, dir, ip, organic, cheater=cheater)

    amqp.handle_items('register_vote_q', _handle_vote)

def run():
    def callback(msgs, chan):
        for msg in msgs:  # will be len==1
            # cr is a r2.lib.db.queries.CachedResults
            cr = pickle.loads(msg.body)
            iden = cr.query._iden()

            working_key = working_prefix + iden
            key = prefix + iden

            last_time = g.memcache.get(key)
            # check to see if we've computed this job since it was
            # added to the queue
            if last_time and last_time > msg.timestamp:
                print 'skipping, already computed ', key
                return

            if not cr.preflight_check():
                print 'skipping, preflight check failed', key
                return
            # check if someone else is working on this
            elif not g.memcache.add(working_key, 1, TIMEOUT):
                print 'skipping, someone else is working', working_key
                return

            print 'working: ', iden, cr.query._rules, cr.query._sort
            start = datetime.now()
            try:
                cr.update()
                g.memcache.set(key, datetime.now())

                cr.postflight()
            finally:
                g.memcache.delete(working_key)

            done = datetime.now()
            q_time_s = (done - msg.timestamp).seconds
            proc_time_s = (done - start).seconds + (
                (done - start).microseconds / 1000000.0)
            print ('processed %s in %.6f seconds after %d seconds in queue'
                   % (iden, proc_time_s, q_time_s))

    amqp.handle_items('prec_links', callback, limit=1)

def run_changed(drain=False, min_size=1, limit=1000, sleep_time=10,
                use_safe_get=False, verbose=False):
    '''Run by `cron` (through `paster run`) on a schedule to send Things to
    Amazon CloudSearch

    '''
    @g.stats.amqp_processor('cloudsearch_changes_q')
    def _run_changed(msgs, chan):
        '''Consume the cloudsearch_changes_q queue, and print reporting
        information on how long it took and how many remain

        '''
        start = datetime.now(g.tz)

        changed = [pickle.loads(msg.body) for msg in msgs]

        fullnames = set()
        fullnames.update(LinkUploader.desired_fullnames(changed))
        fullnames.update(SubredditUploader.desired_fullnames(changed))

        things = Thing._by_fullname(fullnames, data=True, return_dict=False)

        link_uploader = LinkUploader(g.CLOUDSEARCH_DOC_API, things=things)
        subreddit_uploader = SubredditUploader(
            g.CLOUDSEARCH_SUBREDDIT_DOC_API, things=things)

        link_time = link_uploader.inject()
        subreddit_time = subreddit_uploader.inject()
        cloudsearch_time = link_time + subreddit_time

        totaltime = (datetime.now(g.tz) - start).total_seconds()

        print ("%s: %d messages in %.2f seconds (%.2f seconds waiting on "
               "cloudsearch); %d duplicates, %s remaining" %
               (start, len(changed), totaltime, cloudsearch_time,
                len(changed) - len(things),
                msgs[-1].delivery_info.get('message_count', 'unknown')))

    if use_safe_get:
        CloudSearchUploader.use_safe_get = True
    amqp.handle_items('cloudsearch_changes_q', _run_changed,
                      min_size=min_size, limit=limit, drain=drain,
                      sleep_time=sleep_time, verbose=verbose)

def run_changed(drain=False, limit=100, sleep_time=10, verbose=True):
    """reddit-consumer-update_promos: amqp consumer of update_promos_q

    Handles asynch accepting/rejecting of ads that are scheduled to be live
    right now

    """
    @g.stats.amqp_processor(UPDATE_QUEUE)
    def _run(msgs, chan):
        items = [json.loads(msg.body) for msg in msgs]

        if QUEUE_ALL in items:
            # QUEUE_ALL is just an indicator to run make_daily_promotions.
            # There's no promotion log to update in this case.
            print "Received %s QUEUE_ALL message(s)" % items.count(QUEUE_ALL)
            items = [i for i in items if i != QUEUE_ALL]
            make_daily_promotions()

    amqp.handle_items(UPDATE_QUEUE, _run, limit=limit, drain=drain,
                      sleep_time=sleep_time, verbose=verbose)

def run():
    def process_msgs(msgs, chan):
        def _process_link(fname):
            link = Link._by_fullname(fname, data=True, return_dict=False)
            set_media(link)

        for msg in msgs:
            fname = msg.body
            try:
                TimeoutFunction(_process_link, 30)(fname)
            except TimeoutFunctionException:
                print "Timed out on %s" % fname
            except KeyboardInterrupt:
                raise
            except:
                print "Error fetching %s" % fname
                print traceback.format_exc()

    amqp.handle_items('scraper_q', process_msgs, limit=1)

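# The scraper consumer above expects one link fullname per message. A minimal
# producer-side sketch, assuming r2.lib.amqp.add_item(routing_key, body) as
# used by the other consumers in this file; the helper name is illustrative:
def queue_link_for_scraping(link):
    from r2.lib import amqp

    # process_msgs() above resolves the fullname with Link._by_fullname()
    # and then calls set_media() under a 30-second timeout
    amqp.add_item('scraper_q', link._fullname)
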
def process_votes(limit=1000):
    # limit is taken but ignored for backwards compatibility
    def _handle_vote(msgs, chan):
        #assert(len(msgs) == 1)
        comments = []
        for msg in msgs:
            r = pickle.loads(msg.body)

            uid, tid, dir, ip, organic, cheater = r
            voter = Account._byID(uid, data=True)
            votee = Thing._by_fullname(tid, data=True)
            if isinstance(votee, Comment):
                comments.append(votee)
            print(voter, votee, dir, ip, organic, cheater)
            handle_vote(voter, votee, dir, ip, organic, cheater=cheater)

        update_comment_votes(comments)

    amqp.handle_items('register_vote_q', _handle_vote, limit=limit)

def process_votes(limit=1000):
    # limit is taken but ignored for backwards compatibility
    def _handle_vote(msgs, chan):
        # assert(len(msgs) == 1)
        comments = []
        for msg in msgs:
            r = pickle.loads(msg.body)

            uid, tid, dir, ip, organic, cheater = r
            voter = Account._byID(uid, data=True)
            votee = Thing._by_fullname(tid, data=True)
            if isinstance(votee, Comment):
                comments.append(votee)
            print (voter, votee, dir, ip, organic, cheater)
            handle_vote(voter, votee, dir, ip, organic, cheater=cheater)

        update_comment_votes(comments)

    amqp.handle_items("register_vote_q", _handle_vote, limit=limit)

def process_votes(drain=False, limit=100):
    def _handle_votes(msgs, chan):
        to_do = []
        uids = set()
        tids = set()
        for x in msgs:
            r = pickle.loads(x.body)
            uid, tid, dir, ip, organic, cheater = r
            print (uid, tid, dir, ip, organic, cheater)

            uids.add(uid)
            tids.add(tid)
            to_do.append((uid, tid, dir, ip, organic, cheater))

        users = Account._byID(uids, data=True, return_dict=True)
        things = Thing._by_fullname(tids, data=True, return_dict=True)

        for uid, tid, dir, ip, organic, cheater in to_do:
            handle_vote(users[uid], things[tid], dir, ip, organic,
                        cheater=cheater)

    amqp.handle_items("register_vote_q", _handle_votes, limit=limit,
                      drain=drain)

def run_commentstree():
    """Add new incoming comments to their respective comments trees"""
    def _run_commentstree(msgs, chan):
        fnames = [msg.body for msg in msgs]
        comments = Comment._by_fullname(fnames, data=True, return_dict=False)

        links = Link._byID(set(cm.link_id for cm in comments),
                           data=True, return_dict=True)

        # add the comment to the comments-tree
        for comment in comments:
            l = links[comment.link_id]
            try:
                add_comment_tree(comment, l)
            except KeyError:
                # Hackity hack. Try to recover from a corrupted
                # comment tree
                print "Trying to fix broken comments-tree."
                link_comments(l._id, _update=True)
                add_comment_tree(comment, l)

    amqp.handle_items("commentstree_q", _run_commentstree, limit=1)

def run_commentstree():
    """Add new incoming comments to their respective comments trees"""
    def _run_commentstree(msgs, chan):
        fnames = [msg.body for msg in msgs]
        comments = Comment._by_fullname(fnames, data=True, return_dict=False)

        links = Link._byID(set(cm.link_id for cm in comments),
                           data=True, return_dict=True)

        # add the comment to the comments-tree
        for comment in comments:
            l = links[comment.link_id]
            try:
                add_comment_tree(comment, l)
            except KeyError:
                # Hackity hack. Try to recover from a corrupted
                # comment tree
                print "Trying to fix broken comments-tree."
                link_comments(l._id, _update=True)
                add_comment_tree(comment, l)

    amqp.handle_items('commentstree_q', _run_commentstree, limit=1)

def consume_domain_query_queue(qname="domain_query_q", limit=1000):
    @g.stats.amqp_processor(qname)
    def process_message(msgs, chan):
        """Update get_domain_links(), the Links by domain precomputed query.

        get_domain_links() is a CachedResult which is stored in permacache.
        To update these objects we need to do a read-modify-write which
        requires obtaining a lock. Sharding these updates by domain allows
        us to run multiple consumers (but ideally just one per shard) to
        avoid lock contention.

        """
        from r2.lib.db.queries import add_queries, get_domain_links

        link_names = {msg.body for msg in msgs}
        links = Link._by_fullname(link_names, return_dict=False)
        print 'Processing %r' % (links,)

        links_by_domain = defaultdict(list)
        for link in links:
            parsed = UrlParser(link.url)

            # update the listings for all permutations of the link's domain
            for domain in parsed.domain_permutations():
                links_by_domain[domain].append(link)

        for d, links in links_by_domain.iteritems():
            with g.stats.get_timer("link_vote_processor.domain_queries"):
                add_queries(
                    queries=[
                        get_domain_links(d, sort, "all") for sort in SORTS],
                    insert_items=links,
                )

    amqp.handle_items(qname, process_message, limit=limit)

def consume_domain_query_queue(qname="domain_query_q", limit=1000):
    @g.stats.amqp_processor(qname)
    def process_message(msgs, chan):
        """Update get_domain_links(), the Links by domain precomputed query.

        get_domain_links() is a CachedResult which is stored in permacache.
        To update these objects we need to do a read-modify-write which
        requires obtaining a lock. Sharding these updates by domain allows
        us to run multiple consumers (but ideally just one per shard) to
        avoid lock contention.

        """
        from r2.lib.db.queries import add_queries, get_domain_links

        link_names = {msg.body for msg in msgs}
        links = Link._by_fullname(link_names, return_dict=False)
        print 'Processing %r' % (links,)

        links_by_domain = defaultdict(list)
        for link in links:
            parsed = UrlParser(link.url)

            # update the listings for all permutations of the link's domain
            for domain in parsed.domain_permutations():
                links_by_domain[domain].append(link)

        for d, links in links_by_domain.iteritems():
            with g.stats.get_timer("link_vote_processor.domain_queries"):
                add_queries(
                    queries=[
                        get_domain_links(d, sort, "all") for sort in SORTS],
                    insert_items=links,
                )

    amqp.handle_items(qname, process_message, limit=limit)

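# The two sharded query-queue consumers above (by subreddit and by domain)
# both expect link fullnames as message bodies. A minimal producer-side
# sketch, assuming r2.lib.amqp.add_item(routing_key, body); the helper name is
# illustrative and not necessarily how the vote processor actually fans these
# updates out:
def queue_link_for_query_updates(link):
    from r2.lib import amqp

    # hand the link to both precomputed-query consumers; each shard holds its
    # own lock, so per-shard consumers avoid contention on the CachedResults
    amqp.add_item("subreddit_query_q", link._fullname)
    amqp.add_item("domain_query_q", link._fullname)
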
            uid, tid, dir, ip, organic, cheater = r
            voter = Account._byID(uid, data=True)
            votee = Thing._by_fullname(tid, data=True)
            if isinstance(votee, Comment):
                comments.append(votee)

            if not isinstance(votee, (Link, Comment)):
                # I don't know how, but somebody is sneaking in votes
                # for subreddits
                continue

            print (voter, votee, dir, ip, organic, cheater)
            try:
                handle_vote(voter, votee, dir, ip, organic,
                            cheater=cheater, foreground=False)
            except Exception, e:
                print 'Rejecting %r:%r because of %r' % (msg.delivery_tag, r, e)
                chan.basic_reject(msg.delivery_tag, requeue=True)

        update_comment_votes(comments)

    amqp.handle_items(qname, _handle_vote, limit=limit)

process_votes = process_votes_single

try:
    from r2admin.lib.admin_queries import *
except ImportError:
    pass

            if not isinstance(votee, (Link, Comment)):
                # I don't know how, but somebody is sneaking in votes
                # for subreddits
                continue

            print (voter, votee, dir, ip, organic, cheater)
            try:
                handle_vote(voter, votee, dir, ip, organic,
                            cheater=cheater, foreground=False)
            except Exception, e:
                print 'Rejecting %r:%r because of %r' % (msg.delivery_tag, r, e)
                chan.basic_reject(msg.delivery_tag, requeue=True)

        update_comment_votes(comments)

    amqp.handle_items('register_vote_q', _handle_vote, limit=limit)

process_votes = process_votes_single


def process_comment_sorts(limit=500):
    def _handle_sort(msgs, chan):
        cids = list(set(int(msg.body) for msg in msgs))
        comments = Comment._byID(cids, data=True, return_dict=False)
        print comments
        update_comment_votes(comments)

    amqp.handle_items('commentsort_q', _handle_sort, limit=limit)

try:
    from r2admin.lib.admin_queries import *
except ImportError:
    pass

def run(limit=100, streamfile=None, verbose=False):
    if streamfile:
        stream_fp = open(streamfile, "a")
    else:
        stream_fp = None

    def streamlog(msg, important=False):
        if stream_fp:
            stream_fp.write(msg + "\n")
            stream_fp.flush()
        if important:
            print msg

    def add_timestamps(d):
        d['hms'] = d['time'].strftime("%H:%M:%S")
        d['occ'] = "<%s:%s, pid=%-5s, %s>" % (
            d['host'], d['port'], d['pid'],
            d['time'].strftime("%Y-%m-%d %H:%M:%S"))

    def limited_append(l, item):
        if len(l) >= 25:
            l.pop(12)
        l.append(item)

    def log_exception(d, daystring):
        exc_desc = d['exception_desc']
        exc_type = d['exception_type']
        exc_str = "%s: %s" % (exc_type, exc_desc)

        add_timestamps(d)

        tb = []

        key_material = exc_type
        pretty_lines = []

        make_lock_seen = False
        flaky_db_seen = False

        for tpl in d['traceback']:
            tb.append(tpl)
            filename, lineno, funcname, text = tpl
            if text is None:
                pass
            elif (text.startswith("with g.make_lock(") or
                  text.startswith("with make_lock(")):
                make_lock_seen = True
            elif (text.startswith(
                    "(ProgrammingError) server closed the connection")):
                flaky_db_seen = True
            key_material += "%s %s " % (filename, funcname)
            pretty_lines.append("%s:%s: %s()" % (filename, lineno, funcname))
            pretty_lines.append("    %s" % text)

        if exc_desc.startswith("QueuePool limit of size"):
            fingerprint = "QueuePool_overflow"
        elif exc_desc.startswith("error 2 from memcached_get: HOSTNAME "):
            fingerprint = "memcache_suckitude"
        elif exc_type == "TimeoutExpired" and make_lock_seen:
            fingerprint = "make_lock_timeout"
        elif exc_desc.startswith("(OperationalError) FATAL: the database " +
                                 "system is in recovery mode"):
            fingerprint = "recovering_db"
        elif exc_desc.startswith("(OperationalError) could not connect " +
                                 "to server"):
            fingerprint = "unconnectable_db"
        elif exc_desc.startswith("(OperationalError) server closed the " +
                                 "connection unexpectedly"):
            fingerprint = "flaky_db_op"
        elif exc_type == "ProgrammingError" and flaky_db_seen:
            fingerprint = "flaky_db_prog"
            # SQLAlchemy includes the entire query in the exception
            # description which can sometimes be gigantic, in the case of
            # SELECTs. Get rid of it.
            select_pos = exc_str.find("SELECT")
            if select_pos > 0:
                exc_str = exc_str[:select_pos]
        elif exc_type == "NoServerAvailable":
            fingerprint = "cassandra_suckitude"
        else:
            fingerprint = md5(key_material).hexdigest()

        nickname_key = "error_nickname-" + fingerprint
        status_key = "error_status-" + fingerprint

        nickname = g.hardcache.get(nickname_key)

        if nickname is None:
            nickname = '"%s" Exception' % randword().capitalize()
            news = ("A new kind of thing just happened! " +
                    "I'm going to call it a %s\n\n" % nickname)

            news += "Where and when: %s\n\n" % d['occ']
            news += "Traceback:\n"
            news += "\n".join(pretty_lines)
            news += exc_str
            news += "\n"

            emailer.nerds_email(news, "Exception Watcher")

            g.hardcache.set(nickname_key, nickname, 86400 * 365)
            g.hardcache.set(status_key, "new", 86400)

        if g.hardcache.get(status_key) == "fixed":
            g.hardcache.set(status_key, "new", 86400)
            news = "This was marked as fixed: %s\n" % nickname
            news += "But it just occurred, so I'm marking it new again."
            emailer.nerds_email(news, "Exception Watcher")

        err_key = "-".join(["error", daystring, fingerprint])

        existing = g.hardcache.get(err_key)

        if not existing:
            existing = dict(exception=exc_str, traceback=tb, occurrences=[])

        limited_append(existing['occurrences'], d['occ'])
        g.hardcache.set(err_key, existing, 7 * 86400)

        streamlog("%s [X] %-70s" % (d['hms'], nickname), verbose)

    def log_text(d, daystring):
        add_timestamps(d)
        char = d['level'][0].upper()

        streamlog("%s [%s] %r" % (d['hms'], char, d['text']), verbose)

        logclass_key = "logclass-" + d['classification']
        if not g.hardcache.get(logclass_key):
            g.hardcache.set(logclass_key, True, 86400 * 90)

            if d['level'] != 'debug':
                news = "The code just generated a [%s] message.\n" % \
                       d['classification']
                news += "I don't remember ever seeing one of those before.\n"
                news += "\n"
                news += "It happened on: %s\n" % d['occ']
                news += "The log level was: %s\n" % d['level']
                news += "The complete text was:\n"
                news += repr(d['text'])
                emailer.nerds_email(news, "reddit secretary")

        occ_key = "-".join(["logtext", daystring,
                            d['level'], d['classification']])

        occurrences = g.hardcache.get(occ_key)

        if occurrences is None:
            occurrences = []

        d2 = {}
        d2['occ'] = d['occ']
        d2['text'] = repr(d['text'])

        limited_append(occurrences, d2)
        g.hardcache.set(occ_key, occurrences, 86400 * 7)

    def myfunc(msgs, chan):
        daystring = datetime.now(g.display_tz).strftime("%Y/%m/%d")
        for msg in msgs:
            try:
                d = pickle.loads(msg.body)
            except TypeError:
                streamlog("wtf is %r" % msg.body, True)
                continue

            if not 'type' in d:
                streamlog("wtf is %r" % d, True)
            elif d['type'] == 'exception':
                try:
                    log_exception(d, daystring)
                except Exception as e:
                    print "Error in log_exception(): %r" % e
            elif d['type'] == 'text':
                try:
                    log_text(d, daystring)
                except Exception as e:
                    print "Error in log_text(): %r" % e
            else:
                streamlog("wtf is %r" % d['type'], True)

    amqp.handle_items(q, myfunc, limit=limit, drain=False, verbose=verbose)

            try:
                handle_vote(voter, votee, dir, ip, organic,
                            cheater=cheater, foreground=False)
            except Exception, e:
                print 'Rejecting %r:%r because of %r' % (msg.delivery_tag, r, e)
                chan.basic_reject(msg.delivery_tag, requeue=True)

        update_comment_votes(comments)

    amqp.handle_items('register_vote_q', _handle_vote, limit=limit)

process_votes = process_votes_single


def process_comment_sorts(limit=500):
    def _handle_sort(msgs, chan):
        cids = list(set(int(msg.body) for msg in msgs))
        comments = Comment._byID(cids, data=True, return_dict=False)
        print comments
        update_comment_votes(comments)

    amqp.handle_items('commentsort_q', _handle_sort, limit=limit)

def run_realtime_email_queue(limit=1, debug=False):
    # Email new posts, comments or messages to whoever's set to get them
    # Called from reddit_consumer-realtime_email_q long running job
    from r2.lib import amqp
    from r2.models import Comment, Subreddit, Link, Thing, SaveHide
    from r2.lib.db.operators import asc, desc
    from r2.lib.utils import fetch_things2
    import time

    run_realtime_email_queue.accounts = None
    run_realtime_email_queue.last_got_accounts = 0

    @g.stats.amqp_processor('realtime_email_q')
    def _run_realtime_email_queue(msgs, chan):
        if time.time() - run_realtime_email_queue.last_got_accounts > 600:
            #-- Pick up a fresh list of accounts, if we haven't done so
            #   recently, in case settings change
            if g.email_debug:
                g.log.info('Getting accounts')
            run_realtime_email_queue.accounts = Account._query(
                Account.c.email != None, sort=asc('_date'), data=True)
            run_realtime_email_queue.last_got_accounts = time.time()

        for msg in msgs:
            # msg.body contains the unique name of the post, comment or
            # message, e.g. 't1_2n' (comment #95) or 't6_q' (post #26)
            fullname = str(msg.body)
            fullname_type = fullname[0:2]
            id36 = fullname[3:]
            if g.email_debug:
                g.log.info('msg: %r', fullname)

            howold = (datetime.datetime.now() - msg.timestamp).total_seconds()
            if howold < 110:
                # Wait until this item is 2 minutes old, to allow time for
                # corrections
                if g.email_debug:
                    g.log.info('waiting for a moment')
                time.sleep(120 - howold)

            is_com = is_post = False
            thing = link = comment = None
            if fullname_type == 't1':
                # a comment
                is_com = True
                comment = Comment._byID36(id36, data=True)
                if g.email_debug:
                    g.log.info('comment: %r', comment.body)
                thing = comment
                author = Account._byID(comment.author_id, True)
                kind = Email.Kind.REALTIME_COMMENT
                template = 'email_realtime_comment.html'
                link = Link._byID(comment.link_id, data=True)
                subject = 'Re: %s' % link.title
                sr_id = comment.sr_id
            elif fullname_type == 't6':
                # a post/link
                is_post = True
                link = Link._byID36(id36, data=True)
                if g.email_debug:
                    g.log.info('post: %r', link.title)
                thing = link
                author = Account._byID(link.author_id, True)
                kind = Email.Kind.REALTIME_POST
                template = 'email_realtime_post.html'
                subject = link.title
                sr_id = link.sr_id
            else:
                return

            sr = Subreddit._byID(sr_id, data=True)
            subject = "[%s] %s" % (sr.name, subject)

            for account in run_realtime_email_queue.accounts:
                sub = sr.get_subscriber(account)
                if is_com:
                    if hasattr(sub, 'email_comments') and sub.email_comments:
                        if g.email_debug:
                            g.log.info(' account %r: we should send this '
                                       'comment, because of the space setting',
                                       account.name)
                        whysend = 'space'
                    else:
                        email_thread = Link._somethinged(
                            SaveHide, account, link,
                            'email')[account, link, 'email']
                        if email_thread:
                            if g.email_debug:
                                g.log.info(' account %r: we should send this '
                                           'comment, because of the thread '
                                           'setting', account.name)
                            whysend = 'thread'
                        else:
                            continue
                elif is_post:
                    if hasattr(sub, 'email_posts') and sub.email_posts:
                        if g.email_debug:
                            g.log.info(' account %r: we should send this post',
                                       account.name)
                        whysend = 'space'
                    else:
                        continue

                if not ('session' in locals()):
                    # Open the SMTP session
                    if g.email_debug:
                        g.log.info('Opening SMTP session')
                    session = open_smtp_session()

                # Render the template
                html_email_template = g.mako_lookup.get_template(template)
                html_body = html_email_template.render(
                    link=link, comment=comment, thing=thing, account=account,
                    sub=sub, whysend=whysend)

                from_email = '"%s" <%s>' % (g.realtime_email_from_name,
                                            g.share_reply,)
                send_html_email(account.email, g.share_reply, subject,
                                html_body, from_full=from_email,
                                session=session)
                if g.email_debug:
                    g.log.info(' sent to %r at %r',
                               account.name, account.email)

        if g.email_debug:
            g.log.info('Done running queue')
        if 'session' in locals():
            # Close the session.
            session.quit()

    amqp.handle_items('realtime_email_q', _run_realtime_email_queue,
                      limit=limit)

def run(limit=100, streamfile=None, verbose=False):
    if streamfile:
        stream_fp = open(streamfile, "a")
    else:
        stream_fp = None

    def streamlog(msg, important=False):
        if stream_fp:
            stream_fp.write(msg + "\n")
            stream_fp.flush()
        if important:
            print msg

    def add_timestamps(d):
        d['hms'] = d['time'].strftime("%H:%M:%S")
        d['occ'] = "<%s:%s, pid=%-5s, %s>" % (
            d['host'], d['port'], d['pid'],
            d['time'].strftime("%Y-%m-%d %H:%M:%S"))

    def limited_append(l, item):
        if len(l) >= 25:
            l.pop(12)
        l.append(item)

    def log_exception(d, daystring):
        exc_desc = d['exception_desc']
        exc_type = d['exception_type']
        exc_str = "%s: %s" % (exc_type, exc_desc)

        add_timestamps(d)

        tb = []

        key_material = exc_type
        pretty_lines = []

        make_lock_seen = False
        flaky_db_seen = False

        for tpl in d['traceback']:
            tb.append(tpl)
            filename, lineno, funcname, text = tpl
            if text is None:
                pass
            elif (text.startswith("with g.make_lock(") or
                  text.startswith("with make_lock(")):
                make_lock_seen = True
            elif (text.startswith(
                    "(ProgrammingError) server closed the connection")):
                flaky_db_seen = True
            key_material += "%s %s " % (filename, funcname)
            pretty_lines.append("%s:%s: %s()" % (filename, lineno, funcname))
            pretty_lines.append("    %s" % text)

        if exc_desc.startswith("QueuePool limit of size"):
            fingerprint = "QueuePool_overflow"
        elif exc_desc.startswith("error 2 from memcached_get: HOSTNAME "):
            fingerprint = "memcache_suckitude"
        elif exc_type == "TimeoutExpired" and make_lock_seen:
            fingerprint = "make_lock_timeout"
        elif exc_desc.startswith("(OperationalError) FATAL: the database " +
                                 "system is in recovery mode"):
            fingerprint = "recovering_db"
        elif exc_desc.startswith("(OperationalError) could not connect " +
                                 "to server"):
            fingerprint = "unconnectable_db"
        elif exc_desc.startswith("(OperationalError) server closed the " +
                                 "connection unexpectedly"):
            fingerprint = "flaky_db_op"
        elif exc_type == "ProgrammingError" and flaky_db_seen:
            fingerprint = "flaky_db_prog"
            # SQLAlchemy includes the entire query in the exception
            # description which can sometimes be gigantic, in the case of
            # SELECTs. Get rid of it.
            select_pos = exc_str.find("SELECT")
            if select_pos > 0:
                exc_str = exc_str[:select_pos]
        elif exc_type == "NoServerAvailable":
            fingerprint = "cassandra_suckitude"
        else:
            fingerprint = md5(key_material).hexdigest()

        nickname_key = "error_nickname-" + fingerprint
        status_key = "error_status-" + fingerprint

        nickname = g.hardcache.get(nickname_key)

        if nickname is None:
            nickname = '"%s" Exception' % randword().capitalize()
            news = ("A new kind of thing just happened! " +
                    "I'm going to call it a %s\n\n" % nickname)

            news += "Where and when: %s\n\n" % d['occ']
            news += "Traceback:\n"
            news += "\n".join(pretty_lines)
            news += exc_str
            news += "\n"

            emailer.nerds_email(news, "Exception Watcher")

            g.hardcache.set(nickname_key, nickname, 86400 * 365)
            g.hardcache.set(status_key, "new", 86400)

        if g.hardcache.get(status_key) == "fixed":
            g.hardcache.set(status_key, "new", 86400)
            news = "This was marked as fixed: %s\n" % nickname
            news += "But it just occurred, so I'm marking it new again."
            emailer.nerds_email(news, "Exception Watcher")

        err_key = "-".join(["error", daystring, fingerprint])

        existing = g.hardcache.get(err_key)

        if not existing:
            existing = dict(exception=exc_str, traceback=tb, occurrences=[])

        limited_append(existing['occurrences'], d['occ'])
        g.hardcache.set(err_key, existing, 7 * 86400)

        streamlog("%s [X] %-70s" % (d['hms'], nickname), verbose)

    def log_text(d, daystring):
        add_timestamps(d)
        char = d['level'][0].upper()

        streamlog("%s [%s] %r" % (d['hms'], char, d['text']), verbose)

        logclass_key = "logclass-" + d['classification']
        if not g.hardcache.get(logclass_key):
            g.hardcache.set(logclass_key, True, 86400 * 90)

            if d['level'] != 'debug':
                news = "The code just generated a [%s] message.\n" % \
                       d['classification']
                news += "I don't remember ever seeing one of those before.\n"
                news += "\n"
                news += "It happened on: %s\n" % d['occ']
                news += "The log level was: %s\n" % d['level']
                news += "The complete text was:\n"
                news += repr(d['text'])
                emailer.nerds_email(news, "reddit secretary")

        occ_key = "-".join(["logtext", daystring,
                            d['level'], d['classification']])

        occurrences = g.hardcache.get(occ_key)

        if occurrences is None:
            occurrences = []

        d2 = {}
        d2['occ'] = d['occ']
        d2['text'] = repr(d['text'])

        limited_append(occurrences, d2)
        g.hardcache.set(occ_key, occurrences, 86400 * 7)

    def myfunc(msgs, chan):
        daystring = datetime.now(g.display_tz).strftime("%Y/%m/%d")
        for msg in msgs:
            try:
                d = pickle.loads(msg.body)
            except TypeError:
                streamlog("wtf is %r" % msg.body, True)
                continue

            if not 'type' in d:
                streamlog("wtf is %r" % d, True)
            elif d['type'] == 'exception':
                try:
                    log_exception(d, daystring)
                except Exception as e:
                    print "Error in log_exception(): %r" % e
            elif d['type'] == 'text':
                try:
                    log_text(d, daystring)
                except Exception as e:
                    print "Error in log_text(): %r" % e
            else:
                streamlog("wtf is %r" % d['type'], True)

    amqp.handle_items(q, myfunc, limit=limit, drain=False, verbose=verbose)
