def handle(self, *args, **kwargs):
    doc_types = get_doc_types()
    limit = kwargs["limit"]
    if limit:
        doc_types = [dt for dt in doc_types if dt.__name__ in limit]

    progress_msg = "Indexed {progress} out of {count}"

    for dt in doc_types:
        self.stdout.write("Reindexing: {}".format(dt.__name__))

        model = dt.get_model()

        before = kwargs["updated_before"]
        after = kwargs["updated_after"]
        if before or after:
            try:
                qs = model.objects_range(before=before, after=after)
            except NotImplementedError:
                print(
                    f"{model} hasn't implemented an `updated_column_name` property. "
                    "No documents will be indexed of this type."
                )
                continue
        else:
            qs = model._default_manager.all()

        total = qs.count()
        count = kwargs["count"]
        percentage = kwargs["percentage"]
        if count:
            print("Indexing {} documents out of {}".format(count, total))
        else:
            if percentage < 100:
                count = int(total * percentage / 100)
                qs = qs[:count]
            else:
                count = total
            print("Indexing {}%, so {} documents out of {}".format(percentage, count, total))

        id_list = list(qs.values_list("pk", flat=True))
        bulk_count = kwargs["bulk_count"]

        for x in range(ceil(count / bulk_count)):
            start = x * bulk_count
            end = start + bulk_count

            index_objects_bulk.delay(
                dt.__name__,
                id_list[start:end],
                timeout=kwargs["timeout"],
            )

            if kwargs["print_sql_count"]:
                print("{} SQL queries executed".format(len(connection.queries)))
                reset_queries()

            print(progress_msg.format(progress=min(end, count), count=count))
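
# For the `--updated-before`/`--updated-after` path above, `handle` expects each
# model to expose an `objects_range` classmethod that raises NotImplementedError
# when the model doesn't declare which column tracks updates. A minimal sketch of
# that contract, assuming a mixin; the `SearchMixin` name and filter details are
# illustrative, not the actual kitsune implementation:


class SearchMixin:
    # Models opt in by naming the column that records their last update,
    # e.g. "updated"; None means range-based reindexing is unsupported.
    updated_column_name = None

    @classmethod
    def objects_range(cls, before=None, after=None):
        if cls.updated_column_name is None:
            raise NotImplementedError(
                f"{cls.__name__} doesn't define `updated_column_name`"
            )
        filters = {}
        if before:
            filters[f"{cls.updated_column_name}__lt"] = before
        if after:
            filters[f"{cls.updated_column_name}__gt"] = after
        return cls._default_manager.filter(**filters)
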
def handle_question_vote_delete(instance, **kwargs):
    index_object.delay("QuestionDocument", instance.question_id)
    index_objects_bulk.delay(
        "AnswerDocument",
        list(instance.question.answers.values_list("pk", flat=True)),
    )
def handle_question_save(instance, **kwargs):
    if not isinstance(instance, Question):
        return
    index_object.delay("QuestionDocument", instance.pk)
    index_objects_bulk.delay(
        "AnswerDocument",
        list(instance.answers.values_list("pk", flat=True)),
    )
def handle(self, **options):
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(
        Question.objects.filter(is_archived=False)
        .filter(created__lte=days_180)
        .values_list("id", flat=True)
    )

    if q_ids:
        log.info("Updating %d questions", len(q_ids))

        sql = """
        UPDATE questions_question
        SET is_archived = 1
        WHERE id IN (%s)
        """ % ",".join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        if not transaction.get_connection().in_atomic_block:
            transaction.commit()

        if settings.ES_LIVE_INDEXING:
            # elastic v7 code:
            answer_ids = list(
                Answer.objects.filter(question_id__in=q_ids).values_list("id", flat=True)
            )
            index_objects_bulk.delay("QuestionDocument", q_ids)
            index_objects_bulk.delay("AnswerDocument", answer_ids)

            # elastic v2 code:
            try:
                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.
                from kitsune.search.utils import chunked

                for chunk in chunked(q_ids, 100):
                    # Fetch all the documents we need to update.
                    es_docs = get_documents(QuestionMappingType, chunk)

                    log.info("Updating %d index documents", len(es_docs))

                    documents = []

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc["question_is_archived"] = True
                        doc["indexed_on"] = int(time.time())
                        documents.append(doc)

                    QuestionMappingType.bulk_index(documents)
            except ES_EXCEPTIONS:
                # Something happened with ES, so let's push index
                # updating into an index_task which retries when it
                # fails because of ES issues.
                index_task.delay(to_class_path(QuestionMappingType), q_ids)
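
# The v2 path above imports `chunked` from kitsune.search.utils and feeds it a
# list of ids plus a chunk size. A minimal sketch of what that helper presumably
# does, assuming a sliceable sequence (the real utility may also accept
# arbitrary iterables):


def chunked(iterable, chunk_size):
    """Yield successive chunk_size-sized slices of iterable."""
    for i in range(0, len(iterable), chunk_size):
        yield iterable[i : i + chunk_size]
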
def handle_forum_thread_save(instance, **kwargs):
    index_objects_bulk.delay(
        "ForumDocument",
        list(instance.post_set.values_list("pk", flat=True)),
    )
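
# The three signal handlers above (handle_question_vote_delete,
# handle_question_save, handle_forum_thread_save) only fire once they are
# registered against Django's model signals. A minimal sketch of that wiring,
# assuming registration happens in an AppConfig.ready; the senders, app label,
# and import paths are assumptions inferred from the handler bodies:

from django.apps import AppConfig
from django.db.models.signals import post_delete, post_save


class SearchConfig(AppConfig):
    name = "kitsune.search"  # assumed app label

    def ready(self):
        # Import paths are assumptions based on the models the handlers touch.
        from kitsune.forums.models import Thread
        from kitsune.questions.models import Question, QuestionVote

        post_delete.connect(handle_question_vote_delete, sender=QuestionVote)
        post_save.connect(handle_question_save, sender=Question)
        post_save.connect(handle_forum_thread_save, sender=Thread)

# Note that handle_question_save guards with isinstance(instance, Question),
# which suggests it may also be registered without a specific sender.
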
def handle(self, *args, **kwargs):
    doc_types = get_doc_types()
    limit = kwargs["limit"]
    if limit:
        doc_types = [dt for dt in doc_types if dt.__name__ in limit]

    progress_msg = "Indexed {progress} out of {count}"

    for dt in doc_types:
        self.stdout.write("Reindexing: {}".format(dt.__name__))

        model = dt.get_model()

        before = kwargs["updated_before"]
        after = kwargs["updated_after"]
        if before or after:
            try:
                qs = model.objects_range(before=before, after=after)
            except NotImplementedError:
                print(
                    f"{model} hasn't implemented an `updated_column_name` property. "
                    "No documents will be indexed of this type."
                )
                continue
        else:
            qs = model._default_manager.all()

        total = qs.count()
        count = kwargs["count"]
        percentage = kwargs["percentage"]
        if count:
            print("Indexing {} documents out of {}".format(count, total))
        else:
            if percentage < 100:
                count = int(total * percentage / 100)
                qs = qs[:count]
            else:
                count = total
            print("Indexing {}%, so {} documents out of {}".format(percentage, count, total))

        id_list = list(qs.values_list("pk", flat=True))
        sql_chunk_size = kwargs["sql_chunk_size"]

        # slice the list of ids into chunks of `sql_chunk_size` and send a task to celery
        # to process each chunk. we do this so as to not OOM on celery when processing
        # tens of thousands of documents
        for x in range(ceil(count / sql_chunk_size)):
            start = x * sql_chunk_size
            end = start + sql_chunk_size

            index_objects_bulk.delay(
                dt.__name__,
                id_list[start:end],
                timeout=kwargs["timeout"],
                # elastic_chunk_size determines how many documents get sent to elastic
                # in each bulk request, the limiting factor here is the performance of
                # our elastic cluster
                elastic_chunk_size=kwargs["elastic_chunk_size"],
            )

            if kwargs["print_sql_count"]:
                # connection.queries is only populated when DEBUG is True
                print("{} SQL queries executed".format(len(connection.queries)))
                reset_queries()

            print(progress_msg.format(progress=min(end, count), count=count))
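
# From the call sites above we can infer the task's signature: a document class
# name, a batch of primary keys, a timeout, and (in this later version) an
# elastic_chunk_size. A hypothetical sketch of how elastic_chunk_size might map
# onto elasticsearch's bulk helper; build_actions and get_es_client are
# illustrative stand-ins, not the real kitsune task:

from celery import shared_task
from elasticsearch.helpers import bulk


@shared_task
def index_objects_bulk(doc_type_name, ids, timeout=None, elastic_chunk_size=100):
    # build_actions / get_es_client are hypothetical helpers standing in for
    # however the app resolves its Document classes and its client.
    actions = build_actions(doc_type_name, ids)
    # elasticsearch.helpers.bulk splits the actions into requests of
    # `chunk_size` documents each, which is what elastic_chunk_size controls.
    bulk(
        get_es_client(timeout=timeout),
        actions,
        chunk_size=elastic_chunk_size,
    )
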