def index_items(documents, object_type, **kwargs):
    """
    Index items from an iterable of ElasticSearch documents

    Args:
        documents (iterable of dict): An iterable with ElasticSearch documents to index
        object_type (str): the ES object type
    """
    conn = get_conn()
    # bulk will also break an iterable into chunks. However we should do this here so that
    # we can use the same documents when indexing to multiple aliases.
    for chunk in chunks(documents, chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE):
        for alias in get_active_aliases(conn, [object_type]):
            _, errors = bulk(
                conn,
                chunk,
                index=alias,
                doc_type=GLOBAL_DOC_TYPE,
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                **kwargs,
            )
            if len(errors) > 0:
                raise ReindexException(
                    f"Error during bulk {object_type} insert: {errors}"
                )
def populate_subscriptions_and_roles(self):
    """Populate channel roles and subscriptions for all users and channels"""
    results = celery.group(
        [
            populate_user_subscriptions.si(ids)
            for ids in chunks(
                User.objects.exclude(username=settings.INDEXING_API_USERNAME)
                .exclude(profile__isnull=True)
                .order_by("id")
                .values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ]
        + [
            populate_user_roles.si(ids)
            for ids in chunks(
                Channel.objects.order_by("id").values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ]
    )
    raise self.replace(results)
def test_chunks():
    """test for chunks"""
    input_list = list(range(113))

    output_list = []
    for nums in chunks(input_list):
        output_list += nums
    assert output_list == input_list

    output_list = []
    for nums in chunks(input_list, chunk_size=1):
        output_list += nums
    assert output_list == input_list

    output_list = []
    for nums in chunks(input_list, chunk_size=124):
        output_list += nums
    assert output_list == input_list
def populate_channel_fields(self):
    """
    Populates Channel fields from reddit for all channels
    """
    results = celery.group(
        [
            populate_channel_fields_batch.si(ids)
            for ids in chunks(
                Channel.objects.values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ]
    )
    raise self.replace(results)
def populate_post_and_comment_fields(self):
    """
    Populates Post and Comment fields
    """
    results = celery.group(
        [
            populate_post_and_comment_fields_batch.si(ids)
            for ids in chunks(
                Post.objects.order_by("id").values_list("id", flat=True),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        ]
    )
    raise self.replace(results)
def send_unsent_email_notifications():
    """
    Send all notifications that haven't been sent yet
    """
    for notification_ids in chunks(
        EmailNotification.objects.filter(
            state=EmailNotification.STATE_PENDING
        ).values_list("id", flat=True),
        chunk_size=settings.NOTIFICATION_SEND_CHUNK_SIZE,
    ):
        EmailNotification.objects.filter(id__in=notification_ids).update(
            state=EmailNotification.STATE_SENDING
        )
        tasks.send_email_notification_batch.delay(notification_ids)
def test_chunks_iterable():
    """test that chunks works on non-list iterables too"""
    count = 113
    input_range = range(count)
    chunk_output = []
    for chunk in chunks(input_range, chunk_size=10):
        chunk_output.append(chunk)
    assert len(chunk_output) == ceil(113 / 10)

    range_list = []
    for chunk in chunk_output:
        range_list += chunk
    assert range_list == list(range(count))
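# Minimal sketch of a chunks() helper consistent with test_chunks and
# test_chunks_iterable above: it accepts any iterable (not just lists), yields
# lists of at most chunk_size items, and preserves order. The repo's real
# implementation is not shown here, and its default chunk_size is an assumption.
from itertools import islice


def chunks(iterable, chunk_size=100):
    """Yield successive lists of at most chunk_size items from any iterable"""
    iterator = iter(iterable)
    chunk = list(islice(iterator, chunk_size))
    while chunk:
        yield chunk
        chunk = list(islice(iterator, chunk_size))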
def _gen_attempt_send_notification_batches(notification_settings):
    """
    Generates the set of attempt_send_notification_batch tasks in a fan-out structure

    Args:
        notification_settings (iterable of NotificationSettings):
            an iterable of NotificationSettings to attempt the sends for

    Returns:
        celery.group: the celery group of tasks to execute
    """
    return celery.group(
        [
            attempt_send_notification_batch.si(notification_settings_ids)
            for notification_settings_ids in chunks(
                notification_settings,
                chunk_size=settings.NOTIFICATION_ATTEMPT_CHUNK_SIZE,
            )
        ]
    )
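# Hypothetical usage sketch (not taken from the repo): the celery.group returned
# above can be dispatched directly, fanning out one attempt_send_notification_batch
# task per chunk of settings.NOTIFICATION_ATTEMPT_CHUNK_SIZE items.
_gen_attempt_send_notification_batches(NotificationSettings.objects.all()).delay()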
def test_bulk_index_content_files(
    mocked_es, mocker, settings, errors, indexing_func_name, doc
):  # pylint: disable=too-many-arguments
    """
    index functions for content files should call bulk with correct arguments
    """
    settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE = 3
    course = CourseFactory.create()
    run = LearningResourceRunFactory.create(content_object=course)
    content_files = ContentFileFactory.create_batch(5, run=run)
    mock_get_aliases = mocker.patch(
        "search.indexing_api.get_active_aliases", autospec=True, return_value=["a", "b"]
    )
    bulk_mock = mocker.patch(
        "search.indexing_api.bulk", autospec=True, return_value=(0, errors)
    )
    mocker.patch(
        "search.indexing_api.serialize_content_file_for_bulk",
        autospec=True,
        return_value=doc,
    )
    mocker.patch(
        "search.indexing_api.serialize_content_file_for_bulk_deletion",
        autospec=True,
        return_value=doc,
    )
    index_func = getattr(indexing_api, indexing_func_name)

    if errors:
        with pytest.raises(ReindexException):
            index_func(run.id)
    else:
        index_func(run.id)
        for alias in mock_get_aliases.return_value:
            for chunk in chunks(
                [doc for _ in content_files],
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            ):
                bulk_mock.assert_any_call(
                    mocked_es.conn,
                    chunk,
                    index=alias,
                    doc_type=GLOBAL_DOC_TYPE,
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                    routing=gen_course_id(course.platform, course.course_id),
                )
def populate_all_posts_and_comments(self):
    """
    Backpopulate all posts and comments
    """
    reddit_api = get_admin_api().reddit

    # fetch and base36 decode the latest post id
    newest_post_id = base36.loads(next(reddit_api.front.new()).id)

    # create a celery chord by batching a backpopulate and merging results
    results = (
        celery.group(
            populate_posts_and_comments.si(post_ids)
            for post_ids in chunks(
                range(newest_post_id + 1),
                chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
            )
        )
        | populate_posts_and_comments_merge_results.s()
    )

    raise self.replace(results)
def subscribe_all_users_to_channels(self, *, channel_names):
    """
    Subscribes all users to a set of channels

    Args:
        channel_names (list of str): the names of the channels to subscribe to
    """
    chunk_size = settings.OPEN_DISCUSSIONS_DEFAULT_CHANNEL_BACKPOPULATE_BATCH_SIZE
    query = (
        User.objects.exclude(username=settings.INDEXING_API_USERNAME)
        .order_by("username")
        .values_list("username", flat=True)
        .iterator()
    )

    results = celery.group(
        [
            subscribe_user_range_to_channels.si(
                channel_names=channel_names, usernames=usernames
            )
            for usernames in chunks(query, chunk_size=chunk_size)
        ]
    )
    raise self.replace(results)
def test_index_functions(
    mocked_es, mocker, settings, errors, indexing_func_name, serializing_func_name
):  # pylint: disable=too-many-arguments
    """
    index functions should call bulk with correct arguments
    """
    settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE = 3
    documents = ["doc1", "doc2", "doc3", "doc4", "doc5"]
    mock_get_aliases = mocker.patch(
        "search.indexing_api.get_active_aliases", autospec=True, return_value=["a", "b"]
    )
    mocker.patch(
        f"search.indexing_api.{serializing_func_name}",
        autospec=True,
        return_value=(doc for doc in documents),
    )
    bulk_mock = mocker.patch(
        "search.indexing_api.bulk", autospec=True, return_value=(0, errors)
    )
    index_func = getattr(indexing_api, indexing_func_name)

    if errors:
        with pytest.raises(ReindexException):
            index_func([1, 2, 3])
    else:
        index_func([1, 2, 3])
        for alias in mock_get_aliases.return_value:
            for chunk in chunks(
                documents, chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE
            ):
                bulk_mock.assert_any_call(
                    mocked_es.conn,
                    chunk,
                    index=alias,
                    doc_type=GLOBAL_DOC_TYPE,
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
def start_recreate_index(self):
    """
    Wipe and recreate index and mapping, and index all items.
    """
    try:
        new_backing_indices = {
            obj_type: api.create_backing_index(obj_type)
            for obj_type in VALID_OBJECT_TYPES
        }

        # Do the indexing on the temp index
        log.info(
            "starting to index all posts, comments, profiles, and course catalog objects..."
        )

        blacklisted_ids = load_course_blacklist()

        index_tasks = celery.group(
            [
                index_posts.si(post_ids)
                for post_ids in chunks(
                    Post.objects.order_by("id").values_list("id", flat=True),
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
            ]
            + [
                index_comments.si(comment_ids)
                for comment_ids in chunks(
                    Comment.objects.order_by("id").values_list("id", flat=True),
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
            ]
            + [
                index_profiles.si(ids)
                for ids in chunks(
                    User.objects.exclude(username=settings.INDEXING_API_USERNAME)
                    .exclude(profile__isnull=True)
                    .filter(is_active=True)
                    .order_by("id")
                    .values_list("profile__id", flat=True),
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
            ]
            + [
                index_courses.si(ids)
                for ids in chunks(
                    Course.objects.filter(published=True)
                    .exclude(course_id__in=blacklisted_ids)
                    .order_by("id")
                    .values_list("id", flat=True),
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
            ]
            + [
                index_course_content_files.si(ids)
                for ids in chunks(
                    Course.objects.filter(published=True)
                    .filter(
                        platform__in=(PlatformType.ocw.value, PlatformType.xpro.value)
                    )
                    .exclude(course_id__in=blacklisted_ids)
                    .order_by("id")
                    .values_list("id", flat=True),
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
            ]
            + [
                index_bootcamps.si(ids)
                for ids in chunks(
                    Bootcamp.objects.filter(published=True)
                    .order_by("id")
                    .values_list("id", flat=True),
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
            ]
            + [
                index_programs.si(ids)
                for ids in chunks(
                    Program.objects.filter(published=True)
                    .order_by("id")
                    .values_list("id", flat=True),
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
            ]
            + [
                index_user_lists.si(ids)
                for ids in chunks(
                    UserList.objects.order_by("id")
                    .exclude(items=None)
                    .values_list("id", flat=True),
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
            ]
            + [
                index_videos.si(ids)
                for ids in chunks(
                    Video.objects.filter(published=True)
                    .order_by("id")
                    .values_list("id", flat=True),
                    chunk_size=settings.ELASTICSEARCH_INDEXING_CHUNK_SIZE,
                )
            ]
        )
    except:  # pylint: disable=bare-except
        error = "start_recreate_index threw an error"
        log.exception(error)
        return error

    # Use self.replace so that code waiting on this task will also wait on the indexing and finish tasks
    raise self.replace(
        celery.chain(index_tasks, finish_recreate_index.s(new_backing_indices))
    )
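# Illustrative sketch of the pattern used above (task names taken from
# start_recreate_index, argument values invented for the example): when a
# celery.group is chained into another task, Celery converts the chain into a
# chord, so finish_recreate_index only runs after every indexing task in the
# group has completed, and the group's results are passed along to it.
workflow = celery.chain(
    celery.group(index_posts.si([1, 2]), index_comments.si([3, 4])),
    finish_recreate_index.s(new_backing_indices),
)
# Equivalent explicit chord form:
# celery.chord([index_posts.si([1, 2]), index_comments.si([3, 4])])(
#     finish_recreate_index.s(new_backing_indices)
# )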