def test_chunked(self):
    # chunking nothing yields nothing.
    eq_(list(chunked([], 1)), [])

    # chunking a list where len(list) < n
    eq_(list(chunked([1], 10)), [[1]])

    # chunking a list where len(list) == n
    eq_(list(chunked([1, 2], 2)), [[1, 2]])

    # chunking a list where len(list) > n
    eq_(list(chunked([1, 2, 3, 4, 5], 2)), [[1, 2], [3, 4], [5]])

    # passing in a length overrides the real len(list)
    eq_(list(chunked([1, 2, 3, 4, 5, 6, 7], 2, length=4)), [[1, 2], [3, 4]])

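# None of the snippets in this listing show `chunked` itself. Below is a
# minimal sketch that is consistent with the tests above and with the call
# sites further down (the real kitsune.search.utils.chunked may be implemented
# differently): it slices an iterable into lists of at most `n` items, and an
# optional `length` argument overrides len() so querysets can be passed in
# along with a precomputed count.
from itertools import islice


def chunked(iterable, n, length=None):
    """Yield successive lists of up to `n` items from `iterable`."""
    if length is None:
        length = len(iterable)
    iterator = iter(iterable)
    for start in range(0, length, n):
        # Take at most `n` items, but never read past `length` items total.
        chunk = list(islice(iterator, min(n, length - start)))
        if not chunk:
            break
        yield chunk
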
def auto_archive_old_questions():
    """Archive all questions that were created over 180 days ago"""
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(Question.objects.filter(is_archived=False)
                 .filter(created__lte=days_180)
                 .values_list('id', flat=True))

    if q_ids:
        log.info('Updating %d questions', len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_archived = 1
            WHERE id IN (%s)
            """ % ','.join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        if not transaction.get_connection().in_atomic_block:
            transaction.commit()

        if settings.ES_LIVE_INDEXING:
            try:
                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.
                from kitsune.search.utils import chunked
                for chunk in chunked(q_ids, 100):
                    # Fetch all the documents we need to update.
                    es_docs = get_documents(QuestionMappingType, chunk)

                    log.info('Updating %d index documents', len(es_docs))

                    documents = []

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_archived'] = True
                        doc[u'indexed_on'] = int(time.time())
                        documents.append(doc)

                    QuestionMappingType.bulk_index(documents)
            except ES_EXCEPTIONS:
                # Something happened with ES, so let's push index
                # updating into an index_task which retries when it
                # fails because of ES issues.
                index_task.delay(QuestionMappingType, q_ids)

def recalculate_karma_points():
    """Go through all karma action data and recalculate points."""
    if not waffle.switch_is_active('karma'):
        return

    for chunk in chunked(list(KarmaManager().user_ids()), 2500):
        _process_recalculate_chunk.apply_async(args=[chunk])

def rebuild_kb():
    """Re-render all documents in the KB in chunks."""
    cache.delete(settings.WIKI_REBUILD_TOKEN)

    d = (Document.objects.using("default")
         .filter(current_revision__isnull=False)
         .values_list("id", flat=True))

    for chunk in chunked(d, 100):
        _rebuild_kb_chunk.apply_async(args=[chunk])

def init_karma():
    """Flushes the karma redis backend and populates it with fresh data.

    Goes through all questions/answers/votes and saves karma actions for
    them.
    """
    if not waffle.switch_is_active('karma'):
        return

    redis_client('karma').flushdb()

    questions = Question.objects.all()
    for chunk in chunked(questions.values_list('pk', flat=True), 200):
        _process_question_chunk.apply_async(args=[chunk])

    votes = AnswerVote.objects.all()
    for chunk in chunked(votes.values_list('pk', flat=True), 1000):
        _process_answer_vote_chunk.apply_async(args=[chunk])

def forwards(self, orm):
    """Copy all the question.{topics,products} to {topic,product} FK."""
    count = orm.Question.objects.count()
    for chunk in chunked(orm.Question.objects.all(), 2500, count):
        for question in chunk:
            question.product = question.products.first()
            question.topic = question.topics.first()
            if question.product or question.topic:
                question.save()

def change_and_reindex(self, orm, is_archived, is_locked):
    """Archive or lock all questions that were created over 180 days ago."""
    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)

    assert is_archived != is_locked

    f = Q(created__lte=days_180)
    if is_archived:
        f |= Q(is_locked=True)
    if is_locked:
        f |= Q(is_archived=True)

    # Update the DB
    (orm.Question.objects.filter(f)
     .update(is_archived=is_archived, is_locked=is_locked))

    # Using the efficient .update() of query sets doesn't emit any
    # signals, so live indexing won't automatically happen. This
    # does it manually.
    if settings.ES_LIVE_INDEXING:
        q_ids = list(
            orm.Question.objects.filter(f).values_list('id', flat=True))

        try:
            # This is going to process about 200K questions in
            # production, so it will take a while and stress
            # everything. To alleviate this stress, it is
            # divided into chunks.
            for chunk in chunked(q_ids, 1000):
                # Fetch all the documents we need to update.
                es_docs = get_documents(QuestionMappingType, chunk)

                documents = []

                # For each document, update the data and stick it
                # back in the index.
                for doc in es_docs:
                    doc[u'question_is_locked'] = is_locked
                    doc[u'question_is_archived'] = is_archived
                    doc[u'indexed_on'] = int(time.time())
                    documents.append(doc)

                if documents:
                    QuestionMappingType.bulk_index(documents)
        except ES_EXCEPTIONS:
            # Something happened with ES, so let's push index
            # updating into an index_task which retries when it
            # fails because of ES issues.
            index_task.delay(QuestionMappingType, q_ids)

def delete_expired_users(self):
    """
    Remove expired instances of this manager's object class.

    Accounts to be deleted are identified by searching for instances
    of this manager's object class with expired activation keys, and
    then checking to see if their associated ``User`` instances have
    the field ``is_active`` set to ``False``; any ``User`` who is both
    inactive and has an expired activation key will be deleted.
    """
    days_valid = settings.ACCOUNT_ACTIVATION_DAYS
    expired = datetime.now() - timedelta(days=days_valid)
    prof_ids = self.filter(user__date_joined__lt=expired)
    prof_ids = prof_ids.values_list('id', flat=True)
    for chunk in chunked(prof_ids, 1000):
        _delete_registration_profiles_chunk.apply_async(args=[chunk])

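# The docstring above says only users that are both inactive and past the
# activation window get deleted, but that check does not appear in
# delete_expired_users() itself, so it presumably lives in the chunk task.
# A hypothetical sketch of what such a task could do (the real
# _delete_registration_profiles_chunk is not shown in this listing and may
# differ; RegistrationProfile is assumed to be the manager's model):
def _delete_registration_profiles_chunk(prof_ids):
    for profile in RegistrationProfile.objects.filter(id__in=prof_ids):
        user = profile.user
        if not user.is_active:
            # The user never activated: deleting the user cascades to
            # the profile.
            user.delete()
        else:
            # The user did activate; drop only the stale profile.
            profile.delete()
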
def handle(self, **options):
    # Get all questions (id) with a vote in the last week.
    recent = datetime.now() - timedelta(days=7)
    q = QuestionVote.objects.filter(created__gte=recent)
    q = q.values_list('question_id', flat=True).order_by('question')
    q = q.distinct()
    q_with_recent_votes = list(q)

    # Get all questions with num_votes_past_week > 0
    q = Question.objects.filter(num_votes_past_week__gt=0)
    q = q.values_list('id', flat=True)
    q_with_nonzero_votes = list(q)

    # Union.
    qs_to_update = list(set(q_with_recent_votes + q_with_nonzero_votes))

    # Chunk them for tasks.
    for chunk in chunked(qs_to_update, 50):
        update_question_vote_chunk.apply_async(args=[chunk])

def get_customercare_stats():
    """
    Generate customer care stats from the Replies table.

    This gets cached in Redis as a sorted list of contributors, stored
    as JSON.

    Example Top Contributor data:

        [
            {
                'twitter_username': '******',
                'avatar': 'http://twitter.com/path/to/the/avatar.png',
                'avatar_https': 'https://twitter.com/path/to/the/avatar.png',
                'all': 5211,
                '1m': 230,
                '1w': 33,
                '1d': 3,
            },
            { ... },
            { ... },
        ]
    """
    if settings.STAGE:
        return

    contributor_stats = {}

    now = datetime.now()
    one_month_ago = now - timedelta(days=30)
    one_week_ago = now - timedelta(days=7)
    yesterday = now - timedelta(days=1)

    for chunk in chunked(Reply.objects.all(), 2500, Reply.objects.count()):
        for reply in chunk:
            user = reply.twitter_username
            if user not in contributor_stats:
                raw = json.loads(reply.raw_json)
                if 'from_user' in raw:
                    # For tweets collected using v1 API
                    user_data = raw
                else:
                    user_data = raw['user']

                contributor_stats[user] = {
                    'twitter_username': user,
                    'avatar': user_data['profile_image_url'],
                    'avatar_https': user_data['profile_image_url_https'],
                    'all': 0,
                    '1m': 0,
                    '1w': 0,
                    '1d': 0,
                }
            contributor = contributor_stats[reply.twitter_username]

            contributor['all'] += 1
            if reply.created > one_month_ago:
                contributor['1m'] += 1
                if reply.created > one_week_ago:
                    contributor['1w'] += 1
                    if reply.created > yesterday:
                        contributor['1d'] += 1

    sort_key = settings.CC_TOP_CONTRIB_SORT
    limit = settings.CC_TOP_CONTRIB_LIMIT
    # Sort by whatever is in settings, break ties with 'all'
    contributor_stats = sorted(contributor_stats.values(),
                               key=lambda c: (c[sort_key], c['all']),
                               reverse=True)[:limit]

    try:
        redis = redis_client(name='default')
        key = settings.CC_TOP_CONTRIB_CACHE_KEY
        redis.set(key, json.dumps(contributor_stats))
    except RedisError as e:
        statsd.incr('redis.error')
        log.error('Redis error: %s' % e)

    return contributor_stats

def handle(self, **options):
    """
    This gets cached in Redis as a sorted list of contributors, stored
    as JSON.

    Example Top Contributor data:

        [
            {
                'twitter_username': '******',
                'avatar': 'http://twitter.com/path/to/the/avatar.png',
                'avatar_https': 'https://twitter.com/path/to/the/avatar.png',
                'all': 5211,
                '1m': 230,
                '1w': 33,
                '1d': 3,
            },
            { ... },
            { ... },
        ]
    """
    if settings.STAGE:
        return

    contributor_stats = {}

    now = datetime.now()
    one_month_ago = now - timedelta(days=30)
    one_week_ago = now - timedelta(days=7)
    yesterday = now - timedelta(days=1)

    for chunk in chunked(Reply.objects.all(), 2500, Reply.objects.count()):
        for reply in chunk:
            user = reply.twitter_username
            if user not in contributor_stats:
                raw = json.loads(reply.raw_json)
                if "from_user" in raw:
                    # For tweets collected using v1 API
                    user_data = raw
                else:
                    user_data = raw["user"]

                contributor_stats[user] = {
                    "twitter_username": user,
                    "avatar": user_data["profile_image_url"],
                    "avatar_https": user_data["profile_image_url_https"],
                    "all": 0,
                    "1m": 0,
                    "1w": 0,
                    "1d": 0,
                }
            contributor = contributor_stats[reply.twitter_username]

            contributor["all"] += 1
            if reply.created > one_month_ago:
                contributor["1m"] += 1
                if reply.created > one_week_ago:
                    contributor["1w"] += 1
                    if reply.created > yesterday:
                        contributor["1d"] += 1

    sort_key = settings.CC_TOP_CONTRIB_SORT
    limit = settings.CC_TOP_CONTRIB_LIMIT
    # Sort by whatever is in settings, break ties with 'all'
    contributor_stats = sorted(
        list(contributor_stats.values()),
        key=lambda c: (c[sort_key], c["all"]),
        reverse=True,
    )[:limit]

    try:
        redis = redis_client(name="default")
        key = settings.CC_TOP_CONTRIB_CACHE_KEY
        redis.set(key, json.dumps(contributor_stats))
    except RedisError as e:
        log.error("Redis error: %s" % e)

    return contributor_stats

def get_customercare_stats():
    """
    Generate customer care stats from the Replies table.

    This gets cached in Redis as a sorted list of contributors, stored
    as JSON.

    Example Top Contributor data:

        [
            {
                'twitter_username': '******',
                'avatar': 'http://twitter.com/path/to/the/avatar.png',
                'avatar_https': 'https://twitter.com/path/to/the/avatar.png',
                'all': 5211,
                '1m': 230,
                '1w': 33,
                '1d': 3,
            },
            { ... },
            { ... },
        ]
    """
    if settings.STAGE:
        print('Skipped get_customercare_stats(). '
              'Set settings.STAGE to False to run it for real.')
        return

    contributor_stats = {}

    now = datetime.now()
    one_month_ago = now - timedelta(days=30)
    one_week_ago = now - timedelta(days=7)
    yesterday = now - timedelta(days=1)

    for chunk in chunked(Reply.objects.all(), 2500, Reply.objects.count()):
        for reply in chunk:
            user = reply.twitter_username
            if user not in contributor_stats:
                raw = json.loads(reply.raw_json)
                if 'from_user' in raw:
                    # For tweets collected using v1 API
                    user_data = raw
                else:
                    user_data = raw['user']

                contributor_stats[user] = {
                    'twitter_username': user,
                    'avatar': user_data['profile_image_url'],
                    'avatar_https': user_data['profile_image_url_https'],
                    'all': 0,
                    '1m': 0,
                    '1w': 0,
                    '1d': 0,
                }
            contributor = contributor_stats[reply.twitter_username]

            contributor['all'] += 1
            if reply.created > one_month_ago:
                contributor['1m'] += 1
                if reply.created > one_week_ago:
                    contributor['1w'] += 1
                    if reply.created > yesterday:
                        contributor['1d'] += 1

    sort_key = settings.CC_TOP_CONTRIB_SORT
    limit = settings.CC_TOP_CONTRIB_LIMIT
    # Sort by whatever is in settings, break ties with 'all'
    contributor_stats = sorted(contributor_stats.values(),
                               key=lambda c: (c[sort_key], c['all']),
                               reverse=True)[:limit]

    try:
        redis = redis_client(name='default')
        key = settings.CC_TOP_CONTRIB_CACHE_KEY
        redis.set(key, json.dumps(contributor_stats))
    except RedisError as e:
        statsd.incr('redis.error')
        log.error('Redis error: %s' % e)

    return contributor_stats

def add_short_links(data):
    """Create short_url's for a list of docs."""
    for chunk in chunked(data, 100):
        _add_short_links_chunked.apply_async(args=[chunk])