def auto_archive_old_questions():
    """Archive all questions that were created over 180 days ago"""
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(Question.objects.filter(is_archived=False)
                 .filter(created__lte=days_180)
                 .values_list('id', flat=True))

    if q_ids:
        log.info('Updating %d questions', len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_archived = 1
            WHERE id IN (%s)
            """ % ','.join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        if not transaction.get_connection().in_atomic_block:
            transaction.commit()

        if settings.ES_LIVE_INDEXING:
            try:
                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.
                from kitsune.search.utils import chunked
                for chunk in chunked(q_ids, 100):
                    # Fetch all the documents we need to update.
                    es_docs = get_documents(QuestionMappingType, chunk)

                    log.info('Updating %d index documents', len(es_docs))

                    documents = []

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_archived'] = True
                        doc[u'indexed_on'] = int(time.time())
                        documents.append(doc)

                    QuestionMappingType.bulk_index(documents)

            except ES_EXCEPTIONS:
                # Something happened with ES, so let's push index
                # updating into an index_task which retries when it
                # fails because of ES issues.
                index_task.delay(QuestionMappingType, q_ids)
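The chunked helper imported from kitsune.search.utils is not shown in these excerpts. A minimal sketch of a generator with the behavior the call sites rely on, assuming it only needs to handle sliceable sequences such as the q_ids list (the real implementation may also support arbitrary iterables):

def chunked(iterable, chunk_size):
    # Yield successive chunk_size-sized slices, e.g.
    # chunked([1, 2, 3, 4, 5], 2) -> [1, 2], [3, 4], [5].
    for i in range(0, len(iterable), chunk_size):
        yield iterable[i:i + chunk_size]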
def auto_archive_old_questions():
    """Archive all questions that were created over 180 days ago"""
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(
        Question.objects.filter(is_archived=False).filter(
            created__lte=days_180).values_list('id', flat=True))

    if q_ids:
        log.info('Updating %d questions', len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_archived = 1
            WHERE id IN (%s)
            """ % ','.join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        # Pre-Django-1.6 transaction API (removed in 1.8); the newer
        # variant of this function checks in_atomic_block instead.
        transaction.commit_unless_managed()

        if settings.ES_LIVE_INDEXING:
            try:
                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.
                from kitsune.search.utils import chunked
                for chunk in chunked(q_ids, 100):
                    # Fetch all the documents we need to update.
                    es_docs = get_documents(QuestionMappingType, chunk)

                    log.info('Updating %d index documents', len(es_docs))

                    documents = []

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_archived'] = True
                        doc[u'indexed_on'] = int(time.time())
                        documents.append(doc)

                    QuestionMappingType.bulk_index(documents)

            except ES_EXCEPTIONS:
                # Something happened with ES, so let's push index
                # updating into an index_task which retries when it
                # fails because of ES issues.
                index_task.delay(QuestionMappingType, q_ids)
def test_analyzer_choices(self):
    """Check that the indexer picked the right analyzer."""
    ids = [d.id for d in list(self.docs.values())]
    docs = es_utils.get_documents(DocumentMappingType, ids)
    for doc in docs:
        locale = doc['locale']
        eq_(doc['_analyzer'], self.locale_data[locale]['analyzer'])
def test_analyzer_choices(self):
    """Check that the indexer picked the right analyzer."""
    ids = [d.id for d in self.docs.values()]
    docs = es_utils.get_documents(DocumentMappingType, ids)
    for doc in docs:
        locale = doc['locale']
        eq_(doc['_analyzer'], self.locale_data[locale]['analyzer'])
def update_question_vote_chunk(data):
    """Update num_votes_past_week for a number of questions."""
    # First we recalculate num_votes_past_week in the db.
    log.info('Calculating past week votes for %s questions.' % len(data))

    ids = ','.join(map(str, data))
    sql = """
        UPDATE questions_question q
        SET num_votes_past_week = (
            SELECT COUNT(created)
            FROM questions_questionvote qv
            WHERE qv.question_id = q.id
            AND qv.created >= DATE(SUBDATE(NOW(), 7))
        )
        WHERE q.id IN (%s);
        """ % ids
    cursor = connection.cursor()
    cursor.execute(sql)
    if not transaction.get_connection().in_atomic_block:
        transaction.commit()

    # Next we update our index with the changes we made directly in
    # the db.
    if data and settings.ES_LIVE_INDEXING:
        # Get the data we just updated from the database.
        sql = """
            SELECT id, num_votes_past_week
            FROM questions_question
            WHERE id in (%s);
            """ % ids
        cursor = connection.cursor()
        cursor.execute(sql)

        # Since this returns (id, num_votes_past_week) tuples, we can
        # convert that directly to a dict.
        id_to_num = dict(cursor.fetchall())

        try:
            # Fetch all the documents we need to update.
            from kitsune.questions.models import QuestionMappingType
            from kitsune.search import es_utils
            es_docs = es_utils.get_documents(QuestionMappingType, data)

            # For each document, update the data and stick it back in the
            # index.
            for doc in es_docs:
                # Note: Need to keep this in sync with
                # Question.extract_document.
                num = id_to_num[int(doc[u'id'])]
                doc[u'question_num_votes_past_week'] = num

                QuestionMappingType.index(doc, id_=doc['id'])
        except ES_EXCEPTIONS:
            # Something happened with ES, so let's push index updating
            # into an index_task which retries when it fails because
            # of ES issues.
            index_task.delay(QuestionMappingType, id_to_num.keys())
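update_question_vote_chunk expects an already-chunked list of ids. A hedged sketch of how a periodic caller might drive it; the update_weekly_votes name and the chunk size of 1000 are assumptions for illustration, not taken from these excerpts:

def update_weekly_votes():
    # Hypothetical driver: recalculate weekly vote counts for every
    # question a chunk at a time, so no single UPDATE statement or ES
    # bulk request has to touch the whole table at once.
    q_ids = list(Question.objects.values_list('id', flat=True))
    for chunk in chunked(q_ids, 1000):
        update_question_vote_chunk(chunk)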
def change_and_reindex(self, orm, is_archived, is_locked):
    """Locks all questions that were created over 180 days ago"""
    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)

    assert is_archived != is_locked

    f = Q(created__lte=days_180)
    if is_archived:
        f |= Q(is_locked=True)
    if is_locked:
        f |= Q(is_archived=True)

    # Capture the ids before updating: rows that match f only via the
    # is_locked/is_archived flags stop matching once the update changes
    # those flags, so querying afterwards would miss them when reindexing.
    q_ids = list(
        orm.Question.objects.filter(f).values_list('id', flat=True))

    # Update the DB
    (orm.Question.objects.filter(f)
        .update(is_archived=is_archived, is_locked=is_locked))

    # Using the efficient .update() of query sets doesn't emit any
    # signals, so live indexing won't automatically happen. This
    # does it manually.
    if settings.ES_LIVE_INDEXING:
        try:
            # This is going to process about 200K questions in
            # production, so it will take a while and stress
            # everything. To alleviate this stress, it is
            # divided into chunks.
            for chunk in chunked(q_ids, 1000):
                # Fetch all the documents we need to update.
                es_docs = get_documents(QuestionMappingType, chunk)

                documents = []

                # For each document, update the data and stick it
                # back in the index.
                for doc in es_docs:
                    doc[u'question_is_locked'] = is_locked
                    doc[u'question_is_archived'] = is_archived
                    doc[u'indexed_on'] = int(time.time())
                    documents.append(doc)

                if documents:
                    QuestionMappingType.bulk_index(documents)
        except ES_EXCEPTIONS:
            # Something happened with ES, so let's push index
            # updating into an index_task which retries when it
            # fails because of ES issues.
            index_task.delay(QuestionMappingType, q_ids)
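The comment above notes that QuerySet.update() runs a single SQL UPDATE and therefore never fires model signals. For contrast, a minimal sketch of the signal-based live indexing that .update() bypasses; the receiver name is hypothetical and the real hookup in kitsune may look different:

from django.db.models.signals import post_save

def reindex_question_on_save(sender, instance, **kwargs):
    # Fires on every instance.save(), but never for bulk .update()
    # calls, which is why the migration above reindexes by hand.
    index_task.delay(QuestionMappingType, [instance.id])

post_save.connect(reindex_question_on_save, sender=Question)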
def test_get_documents(self):
    q = QuestionFactory()
    self.refresh()
    docs = es_utils.get_documents(QuestionMappingType, [q.id])
    eq_(docs[0]['id'], q.id)
def test_get_documents(self):
    q = question(save=True)
    self.refresh()
    docs = es_utils.get_documents(QuestionMappingType, [q.id])
    eq_(docs[0]['id'], q.id)
def handle(self, **options):
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(
        Question.objects.filter(is_archived=False).filter(
            created__lte=days_180).values_list("id", flat=True))

    if q_ids:
        log.info("Updating %d questions", len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_archived = 1
            WHERE id IN (%s)
            """ % ",".join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        if not transaction.get_connection().in_atomic_block:
            transaction.commit()

        if settings.ES_LIVE_INDEXING:
            # elastic v7 code:
            answer_ids = list(
                Answer.objects.filter(question_id__in=q_ids).values_list(
                    "id", flat=True))
            index_objects_bulk.delay("QuestionDocument", q_ids)
            index_objects_bulk.delay("AnswerDocument", answer_ids)

            # elastic v2 code:
            try:
                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.
                from kitsune.search.utils import chunked

                for chunk in chunked(q_ids, 100):
                    # Fetch all the documents we need to update.
                    es_docs = get_documents(QuestionMappingType, chunk)

                    log.info("Updating %d index documents", len(es_docs))

                    documents = []

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc["question_is_archived"] = True
                        doc["indexed_on"] = int(time.time())
                        documents.append(doc)

                    QuestionMappingType.bulk_index(documents)

            except ES_EXCEPTIONS:
                # Something happened with ES, so let's push index
                # updating into an index_task which retries when it
                # fails because of ES issues.
                index_task.delay(to_class_path(QuestionMappingType), q_ids)
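The v2 fallback above passes to_class_path(QuestionMappingType) rather than the class itself, presumably so the retrying celery task receives a serializable argument. A plausible sketch of such a helper pair; the real kitsune implementation may differ:

import importlib

def to_class_path(cls):
    # Return the importable dotted path for cls, e.g. 'pkg.mod.ClassName'.
    return '%s.%s' % (cls.__module__, cls.__name__)

def from_class_path(path):
    # Inverse of to_class_path: import the module and return the class.
    module_path, _, class_name = path.rpartition('.')
    return getattr(importlib.import_module(module_path), class_name)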
def test_get_documents(self):
    q = QuestionFactory()
    self.refresh()
    docs = es_utils.get_documents(QuestionMappingType, [q.id])
    eq_(docs[0]["id"], q.id)