async def test_documents_data_types(self, async_client):
    async def async_gen():
        for x in range(100):
            await asyncio.sleep(0)
            yield {"answer": x, "_id": x}

    def sync_gen():
        for x in range(100):
            yield {"answer": x, "_id": x}

    async for ok, item in helpers.async_streaming_bulk(
        async_client, async_gen(), index="test-index", refresh=True
    ):
        assert ok

    assert 100 == (await async_client.count(index="test-index"))["count"]
    assert {"answer": 42} == (await async_client.get(index="test-index", id=42))[
        "_source"
    ]

    await async_client.delete_by_query(
        index="test-index", body={"query": {"match_all": {}}}
    )

    async for ok, item in helpers.async_streaming_bulk(
        async_client, sync_gen(), index="test-index", refresh=True
    ):
        assert ok

    assert 100 == (await async_client.count(index="test-index"))["count"]
    assert {"answer": 42} == (await async_client.get(index="test-index", id=42))[
        "_source"
    ]
async def async_streaming_bulk_types() -> None:
    async for _ in async_streaming_bulk(es, async_gen()):
        pass
    async for _ in async_streaming_bulk(es, async_gen().__aiter__()):
        pass
    async for _ in async_streaming_bulk(es, [{}]):
        pass
    async for _ in async_streaming_bulk(es, ({},)):
        pass
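The `es` and `async_gen` names in this type-check snippet are referenced but not defined here. A minimal sketch of definitions that would satisfy it (only the names come from the snippet; the bodies are assumptions) — `async_streaming_bulk` accepts an async client plus any sync or async iterable of action dicts:

from typing import Any, AsyncIterator, Dict

from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_streaming_bulk

es = AsyncElasticsearch()


async def async_gen() -> AsyncIterator[Dict[str, Any]]:
    # Any async iterable of action dicts is accepted by async_streaming_bulk.
    yield {}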
async def test_actions_remain_unchanged(self, async_client):
    actions = [{"_id": 1}, {"_id": 2}]
    async for ok, item in helpers.async_streaming_bulk(
        async_client, actions, index="test-index"
    ):
        assert ok
    assert [{"_id": 1}, {"_id": 2}] == actions
async def test_rejected_documents_are_retried_at_most_max_retries_times(
    self, async_client
):
    failing_client = FailingBulkClient(
        async_client, fail_at=(1, 2), fail_with=TransportError(429, "Rejected!", {})
    )
    docs = [
        {"_index": "i", "_id": 47, "f": "v"},
        {"_index": "i", "_id": 45, "f": "v"},
        {"_index": "i", "_id": 42, "f": "v"},
    ]
    results = [
        x
        async for x in helpers.async_streaming_bulk(
            failing_client,
            docs,
            raise_on_exception=False,
            raise_on_error=False,
            chunk_size=1,
            max_retries=1,
            initial_backoff=0,
        )
    ]
    assert 3 == len(results)
    assert [False, True, True] == [r[0] for r in results]

    await async_client.indices.refresh(index="i")
    res = await async_client.search(index="i")
    assert {"value": 2, "relation": "eq"} == res["hits"]["total"]
    assert 4 == failing_client._called
async def test_transport_error_can_be_caught(self, async_client):
    failing_client = FailingBulkClient(async_client)
    docs = [
        {"_index": "i", "_id": 47, "f": "v"},
        {"_index": "i", "_id": 45, "f": "v"},
        {"_index": "i", "_id": 42, "f": "v"},
    ]

    results = [
        x
        async for x in helpers.async_streaming_bulk(
            failing_client,
            docs,
            raise_on_exception=False,
            raise_on_error=False,
            chunk_size=1,
        )
    ]
    assert 3 == len(results)
    assert [True, False, True] == [r[0] for r in results]

    exc = results[1][1]["index"].pop("exception")
    assert isinstance(exc, TransportError)
    assert 599 == exc.status_code
    assert {
        "index": {
            "_index": "i",
            "_id": 45,
            "data": {"f": "v"},
            "error": "TransportError(599, 'Error!')",
            "status": 599,
        }
    } == results[1][1]
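The FailingBulkClient used by the two tests above (and by streaming_bulk() further down) is not shown. A minimal sketch consistent with how it is called here — fail_at counts bulk calls, _called records the total, and the default failure matches the TransportError(599, 'Error!') asserted above; the actual test double may differ:

class FailingBulkClient:
    def __init__(
        self, client, fail_at=(2,), fail_with=TransportError(599, "Error!", {})
    ):
        self.client = client
        self._called = 0
        self._fail_at = fail_at
        self._fail_with = fail_with
        self.transport = client.transport

    async def bulk(self, *args, **kwargs):
        # Fail with the configured error on the configured attempt numbers,
        # otherwise delegate to the real client.
        self._called += 1
        if self._called in self._fail_at:
            raise self._fail_with
        return await self.client.bulk(*args, **kwargs)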
async def test_different_op_types(self, async_client):
    await async_client.index(index="i", id=45, body={})
    await async_client.index(index="i", id=42, body={})
    docs = [
        {"_index": "i", "_id": 47, "f": "v"},
        {"_op_type": "delete", "_index": "i", "_id": 45},
        {"_op_type": "update", "_index": "i", "_id": 42, "doc": {"answer": 42}},
    ]
    async for ok, item in helpers.async_streaming_bulk(async_client, docs):
        assert ok

    assert not await async_client.exists(index="i", id=45)
    assert {"answer": 42} == (await async_client.get(index="i", id=42))["_source"]
    assert {"f": "v"} == (await async_client.get(index="i", id=47))["_source"]
async def test_all_errors_from_chunk_are_raised_on_failure(self, async_client):
    await async_client.indices.create(
        "i",
        {
            "mappings": {"properties": {"a": {"type": "integer"}}},
            "settings": {"number_of_shards": 1, "number_of_replicas": 0},
        },
    )
    await async_client.cluster.health(wait_for_status="yellow")

    try:
        async for ok, item in helpers.async_streaming_bulk(
            async_client, [{"a": "b"}, {"a": "c"}], index="i", raise_on_error=True
        ):
            assert ok
    except helpers.BulkIndexError as e:
        assert 2 == len(e.errors)
    else:
        assert False, "exception should have been raised"
async def process(self, **kwargs):
    async for ok, result in async_streaming_bulk(
        self.es_client.es, self.get_next_doc()
    ):
        self.tqdm.update(self.parser.file_stream.fileobj.tell() - self.tqdm.n)
        self.tqdm_etree.update()
        action, result = result.popitem()
        if not ok:
            print("failed to %s document %s" % (action, result))
async def ingest():
    if not (await es.indices.exists(index="games")):
        await es.indices.create(index="games")
    async for _ in async_streaming_bulk(
        client=es, index="games", actions=download_games_db()
    ):
        pass
    return {"status": "ok"}
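download_games_db() is not defined in this snippet. Since async_streaming_bulk accepts any sync or async iterable of action dicts, a stand-in only needs to yield one dict per document. A hypothetical sketch (names and data are illustrative, not the original):

async def download_games_db():
    # Hypothetical source standing in for the real download_games_db, which
    # is not shown. Each yielded dict is indexed as one document in "games".
    for game in ({"title": "Chess"}, {"title": "Go"}):
        yield game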
async def claim_consumer(self, claim_producer):
    touched = set()
    async for ok, item in async_streaming_bulk(
        self.sync_client,
        self._consume_claim_producer(claim_producer),
        raise_on_error=False,
    ):
        if not ok:
            self.logger.warning("indexing failed for an item: %s", item)
        else:
            item = item.popitem()[1]
            touched.add(item['_id'])
    await self.sync_client.indices.refresh(self.index)
    self.logger.info("Indexing done.")
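_consume_claim_producer is not shown here. For the loop above to work it must be an async iterable of bulk action dicts, each carrying the '_id' that claim_consumer() collects into touched. A hypothetical adapter sketch:

async def _consume_claim_producer(self, claim_producer):
    # Hypothetical sketch; the real adapter is not shown above. It only has
    # to yield action dicts with an "_id" for async_streaming_bulk.
    async for claim in claim_producer:
        yield {"_index": self.index, "_id": claim["claim_id"], **claim}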
async def test_all_documents_get_inserted(self, async_client):
    docs = [{"answer": x, "_id": x} for x in range(100)]
    async for ok, item in helpers.async_streaming_bulk(
        async_client, docs, index="test-index", refresh=True
    ):
        assert ok

    assert 100 == (await async_client.count(index="test-index"))["count"]
    assert {"answer": 42} == (await async_client.get(index="test-index", id=42))[
        "_source"
    ]
async def streaming_bulk():
    results = [
        x
        async for x in helpers.async_streaming_bulk(
            failing_client,
            [{"a": 42}, {"a": 39}],
            raise_on_exception=True,
            max_retries=3,
            initial_backoff=0,
        )
    ]
    return results
async def make_es_index_and_run_sync(env: Env, clients=32, force=False, db=None, index_name='claims'):
    index = SearchIndex(
        env.es_index_prefix, elastic_host=env.elastic_host, elastic_port=env.elastic_port
    )
    logging.info("ES sync host: %s:%i", env.elastic_host, env.elastic_port)
    try:
        created = await index.start()
    except IndexVersionMismatch as err:
        logging.info(
            "dropping ES search index (version %s) for upgrade to version %s",
            err.got_version, err.expected_version
        )
        await index.delete_index()
        await index.stop()
        created = await index.start()
    finally:
        await index.stop()

    es = AsyncElasticsearch([{'host': env.elastic_host, 'port': env.elastic_port}])
    if force or created:
        claim_generator = get_all_claims(env, index_name=index_name, db=db)
    else:
        claim_generator = get_recent_claims(env, index_name=index_name, db=db)
    try:
        async for ok, item in async_streaming_bulk(
            es, claim_generator, request_timeout=600, raise_on_error=False
        ):
            if not ok:
                logging.warning("indexing failed for an item: %s", item)
        await es.indices.refresh(index=index_name)
    finally:
        await es.close()
async def update_trending_score(self, params):
    update_trending_score_script = """
    double softenLBC(double lbc) { return (Math.pow(lbc, 1.0 / 3.0)); }

    double logsumexp(double x, double y) {
        double top;
        if (x > y) top = x;
        else top = y;
        double result = top + Math.log(Math.exp(x - top) + Math.exp(y - top));
        return (result);
    }

    double logdiffexp(double big, double small) {
        return big + Math.log(1.0 - Math.exp(small - big));
    }

    double squash(double x) {
        if (x < 0.0) return -Math.log(1.0 - x);
        else return Math.log(x + 1.0);
    }

    double unsquash(double x) {
        if (x < 0.0) return 1.0 - Math.exp(-x);
        else return Math.exp(x) - 1.0;
    }

    double log_to_squash(double x) {
        return logsumexp(x, 0.0);
    }

    double squash_to_log(double x) {
        //assert x > 0.0;
        return logdiffexp(x, 0.0);
    }

    double squashed_add(double x, double y) {
        // squash(unsquash(x) + unsquash(y)) but avoiding overflow.
        // Cases where the signs are the same
        if (x < 0.0 && y < 0.0) return -logsumexp(-x, logdiffexp(-y, 0.0));
        if (x >= 0.0 && y >= 0.0) return logsumexp(x, logdiffexp(y, 0.0));
        // Where the signs differ
        if (x >= 0.0 && y < 0.0) {
            if (Math.abs(x) >= Math.abs(y)) return logsumexp(0.0, logdiffexp(x, -y));
            else return -logsumexp(0.0, logdiffexp(-y, x));
        }
        if (x < 0.0 && y >= 0.0) {
            // Addition is commutative, hooray for new math
            return squashed_add(y, x);
        }
        return 0.0;
    }

    double squashed_multiply(double x, double y) {
        // squash(unsquash(x)*unsquash(y)) but avoiding overflow.
        int sign;
        if (x * y >= 0.0) sign = 1;
        else sign = -1;
        return sign * logsumexp(squash_to_log(Math.abs(x)) + squash_to_log(Math.abs(y)), 0.0);
    }

    // Squashed inflated units
    double inflateUnits(int height) {
        double timescale = 576.0; // Half life of 400 = e-folding time of a day
                                  // by coincidence, so may as well go with it
        return log_to_squash(height / timescale);
    }

    double spikePower(double newAmount) {
        if (newAmount < 50.0) {
            return (0.5);
        } else if (newAmount < 85.0) {
            return (newAmount / 100.0);
        } else {
            return (0.85);
        }
    }

    double spikeMass(double oldAmount, double newAmount) {
        double softenedChange = softenLBC(Math.abs(newAmount - oldAmount));
        double changeInSoftened = Math.abs(softenLBC(newAmount) - softenLBC(oldAmount));
        double power = spikePower(newAmount);
        if (oldAmount > newAmount) {
            return -1.0 * Math.pow(changeInSoftened, power) * Math.pow(softenedChange, 1.0 - power);
        } else {
            return Math.pow(changeInSoftened, power) * Math.pow(softenedChange, 1.0 - power);
        }
    }

    for (i in params.src.changes) {
        double units = inflateUnits(i.height);
        if (ctx._source.trending_score == null) {
            ctx._source.trending_score = 0.0;
        }
        double bigSpike = squashed_multiply(units, squash(spikeMass(i.prev_amount, i.new_amount)));
        ctx._source.trending_score = squashed_add(ctx._source.trending_score, bigSpike);
    }
    """
    start = time.perf_counter()

    def producer():
        for claim_id, claim_updates in params.items():
            yield {
                '_id': claim_id,
                '_index': self.index,
                '_op_type': 'update',
                'script': {
                    'lang': 'painless',
                    'source': update_trending_score_script,
                    'params': {
                        'src': {
                            'changes': [{
                                'height': p.height,
                                'prev_amount': p.prev_amount / 1E8,
                                'new_amount': p.new_amount / 1E8,
                            } for p in claim_updates]
                        }
                    }
                },
            }

    if not params:
        return
    async for ok, item in async_streaming_bulk(
        self.sync_client, producer(), raise_on_error=False
    ):
        if not ok:
            self.logger.warning("updating trending failed for an item: %s", item)
    await self.sync_client.indices.refresh(self.index)
    self.logger.info(
        "updated trending scores in %ims", int((time.perf_counter() - start) * 1000)
    )
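producer() implies the shape of params: a mapping of claim_id to a list of updates, each exposing height, prev_amount, and new_amount (amounts in base units, divided by 1E8 before reaching the script). A hypothetical invocation — the TrendingNotification container, the search_index instance, and the claim_id are assumptions for illustration:

from collections import namedtuple

# Assumed container; any object with these three attributes would work.
TrendingNotification = namedtuple("TrendingNotification", "height prev_amount new_amount")

await search_index.update_trending_score({
    "d5169241150022f996fa7cd6a9a1c421937276a3": [  # hypothetical claim_id
        TrendingNotification(height=800000, prev_amount=0, new_amount=250 * 10**8),
    ]
})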