def test_chunks_are_chopped_by_byte_size(self):
    self.assertEqual(
        100,
        len(list(helpers._chunk_actions(self.actions, 100000, 1, JSONSerializer()))),
    )
def test_chunks_are_chopped_by_chunk_size(self):
    self.assertEqual(
        10,
        len(list(helpers._chunk_actions(self.actions, 10, 99999999, JSONSerializer()))),
    )
async def bulk(client, actions, chunk_size=500, max_retries=0,
               max_chunk_bytes=100 * 1024 * 1024,
               expand_action_callback=expand_action, initial_backoff=2,
               max_backoff=600, stats_only=False, **kwargs):
    actions = map(expand_action_callback, actions)

    finish_count = 0
    # with stats_only we only track the number of failures,
    # otherwise we collect the failed items themselves
    if stats_only:
        fail_datas = 0
    else:
        fail_datas = []

    chunk_action_iter = _chunk_actions(actions, chunk_size, max_chunk_bytes,
                                       client.transport.serializer)

    for bulk_data, bulk_action in chunk_action_iter:
        coroutine = _process_bulk(client, bulk_data, bulk_action, **kwargs)
        count, fails = await _retry_handler(client, coroutine, max_retries,
                                            initial_backoff, max_backoff,
                                            **kwargs)

        finish_count += count
        if stats_only:
            fail_datas += len(fails)
        else:
            fail_datas.extend(fails)

    return finish_count, fail_datas
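# A minimal usage sketch for the async ``bulk`` helper above, assuming an
# asyncio-based Elasticsearch client is available as ``client``; the index
# name and document shape are illustrative only, not part of the original
# code.
async def index_documents(client):
    # plain dicts are expanded by ``expand_action`` into
    # (action_line, data_line) pairs before chunking
    docs = [{"_index": "my-index", "_id": i, "value": i} for i in range(1000)]
    success, failed = await bulk(client, docs, chunk_size=250, max_retries=3,
                                 stats_only=True)
    print("indexed=%d failed=%d" % (success, failed))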
def test_chunks_are_chopped_by_byte_size_properly(self):
    max_byte_size = 170
    chunks = list(
        helpers._chunk_actions(self.actions, 100000, max_byte_size,
                               JSONSerializer()))
    self.assertEqual(25, len(chunks))
    for chunk_data, chunk_actions in chunks:
        chunk = u"".join(chunk_actions)
        chunk = chunk if isinstance(chunk, str) else chunk.encode("utf-8")
        self.assertLessEqual(len(chunk), max_byte_size)
def parallel_bulk(client, actions, thread_count=4, chunk_size=500,
                  max_chunk_bytes=100 * 1024 * 1024,
                  expand_action_callback=es_helpers.expand_action, **kwargs):
    """
    es_helpers.parallel_bulk rewritten with imap_fixed_output_buffer instead
    of Pool.imap, which consumed unbounded memory whenever the generator
    outran the upload (which usually happens).
    """
    actions = map(expand_action_callback, actions)
    for result in imap_fixed_output_buffer(
            lambda chunk: list(
                es_helpers._process_bulk_chunk(client, chunk, **kwargs)),
            es_helpers._chunk_actions(actions, chunk_size, max_chunk_bytes,
                                      client.transport.serializer),
            threads=thread_count,
    ):
        for item in result:
            yield item
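# A hedged sketch of how the ``parallel_bulk`` wrapper above might be driven;
# it assumes a synchronous Elasticsearch ``client`` and uses an illustrative
# index name. The wrapper is a generator, so it has to be drained for any
# uploading to happen.
def generate_actions():
    for i in range(100000):
        yield {"_index": "my-index", "_id": i, "value": i}


def upload(client):
    for ok, item in parallel_bulk(client, generate_actions(),
                                  thread_count=8, chunk_size=1000):
        if not ok:
            print("failed:", item)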
async def concurrency_bulk(client, actions, concurrency_count=4,
                           chunk_size=500, max_retries=0,
                           max_chunk_bytes=100 * 1024 * 1024,
                           expand_action_callback=expand_action,
                           initial_backoff=2, max_backoff=600, **kwargs):
    async def concurrency_wrapper(action_iter):
        # each worker pulls chunks from the shared iterator until it is
        # exhausted, so the chunks are spread across the workers
        p_count = p_fails = 0
        for bulk_data, bulk_action in action_iter:
            coroutine = _process_bulk(client, bulk_data, bulk_action, **kwargs)
            count, fails = await _retry_handler(client, coroutine, max_retries,
                                                initial_backoff, max_backoff,
                                                **kwargs)
            p_count += count
            p_fails += len(fails)
        return p_count, p_fails

    actions = map(expand_action_callback, actions)
    chunk_action_iter = _chunk_actions(actions, chunk_size, max_chunk_bytes,
                                       client.transport.serializer)

    tasks = []
    for i in range(concurrency_count):
        tasks.append(concurrency_wrapper(chunk_action_iter))
    results = await asyncio.gather(*tasks, loop=client.loop)

    finish_count = 0
    fail_count = 0
    for p_finish, p_fail in results:
        finish_count += p_finish
        fail_count += p_fail
    return finish_count, fail_count
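# A minimal sketch of calling ``concurrency_bulk`` above from a coroutine;
# the client and index name are assumptions standing in for whatever the
# surrounding project provides.
async def run_concurrent_upload(client):
    docs = ({"_index": "my-index", "_id": i, "value": i}
            for i in range(50000))
    # the workers share a single chunk iterator, so each chunk is sent once
    done, failed = await concurrency_bulk(client, docs, concurrency_count=4,
                                          chunk_size=500, max_retries=2)
    print("done=%d failed=%d" % (done, failed))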
async def streaming_bulk(client, actions, chunk_size=500,
                         max_chunk_bytes=100 * 1024 * 1024,
                         raise_on_error=True,
                         expand_action_callback=expand_action,
                         raise_on_exception=True, max_retries=0,
                         initial_backoff=2, max_backoff=600, yield_ok=True,
                         *args, **kwargs):
    """
    Streaming bulk consumes actions from the iterable passed in and yields
    results per action. For non-streaming use cases use
    :func:`~elasticsearch.helpers.bulk` which is a wrapper around streaming
    bulk that returns summary information about the bulk operation once the
    entire input is consumed and sent.

    If you specify ``max_retries`` it will also retry any documents that were
    rejected with a ``429`` status code. To do this it will wait (**by calling
    asyncio.sleep**) for ``initial_backoff`` seconds and then, every
    subsequent rejection for the same chunk, for double the time every time
    up to ``max_backoff`` seconds.

    :arg client: instance of :class:`~elasticsearch.Elasticsearch` to use
    :arg actions: iterable containing the actions to be executed
    :arg chunk_size: number of docs in one chunk sent to es (default: 500)
    :arg max_chunk_bytes: the maximum size of the request in bytes (default: 100MB)
    :arg raise_on_error: raise ``BulkIndexError`` containing errors (as `.errors`)
        from the execution of the last chunk when some occur. By default we raise.
    :arg raise_on_exception: if ``False`` then don't propagate exceptions from
        call to ``bulk`` and just report the items that failed as failed.
    :arg expand_action_callback: callback executed on each action passed in,
        should return a tuple containing the action line and the data line
        (`None` if data line should be omitted).
    :arg max_retries: maximum number of times a document will be retried when
        ``429`` is received, set to 0 (default) for no retries on ``429``
    :arg initial_backoff: number of seconds we should wait before the first
        retry. Any subsequent retries will be powers of ``initial_backoff *
        2**retry_number``
    :arg max_backoff: maximum number of seconds a retry will wait
    :arg yield_ok: if set to False will skip successful documents in the output
    """
    actions = map(expand_action_callback, actions)

    for bulk_data, bulk_actions in _chunk_actions(actions, chunk_size,
                                                  max_chunk_bytes,
                                                  client.transport.serializer):
        for attempt in range(max_retries + 1):
            to_retry, to_retry_data = [], []
            if attempt:
                await asyncio.sleep(
                    min(max_backoff, initial_backoff * 2 ** (attempt - 1)))

            try:
                async for ok, info in _process_bulk_chunk(
                        client, bulk_actions, bulk_data, raise_on_exception,
                        raise_on_error, *args, **kwargs):
                    if not ok:
                        action, info = info.popitem()
                        # retry if retries enabled, we get 429, and we are not
                        # in the last attempt
                        if (max_retries
                                and info['status'] == 429
                                and (attempt + 1) <= max_retries):
                            # _process_bulk_chunk expects strings so we need to
                            # re-serialize the data
                            to_retry.extend(
                                map(client.transport.serializer.dumps,
                                    bulk_data))
                            to_retry_data.append(bulk_data)
                        else:
                            yield ok, {action: info}
                    elif yield_ok:
                        yield ok, info

            except TransportError as e:
                # suppress 429 errors since we will retry them
                if attempt == max_retries or e.status_code != 429:
                    raise
            else:
                if not to_retry:
                    break
                # retry only subset of documents that didn't succeed
                bulk_actions, bulk_data = to_retry, to_retry_data
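# A hedged sketch of consuming the async ``streaming_bulk`` generator above;
# the client and index name are assumptions. Results come back per document
# as ``(ok, info)`` pairs; with ``yield_ok=False`` only failures are yielded.
async def stream_upload(client):
    docs = ({"_index": "my-index", "_id": i, "value": i}
            for i in range(10000))
    failed = []
    async for ok, info in streaming_bulk(client, docs, chunk_size=500,
                                         max_retries=5, initial_backoff=1):
        if not ok:
            failed.append(info)
    return failed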