Example 1
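    # Demonstrates that async_streaming_bulk accepts both async and sync generators as its action source.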
    async def test_documents_data_types(self, async_client):
        async def async_gen():
            for x in range(100):
                await asyncio.sleep(0)
                yield {"answer": x, "_id": x}

        def sync_gen():
            for x in range(100):
                yield {"answer": x, "_id": x}

        async for ok, item in helpers.async_streaming_bulk(
            async_client, async_gen(), index="test-index", refresh=True
        ):
            assert ok

        assert 100 == (await async_client.count(index="test-index"))["count"]
        assert {"answer": 42} == (await async_client.get(index="test-index", id=42))[
            "_source"
        ]

        await async_client.delete_by_query(
            index="test-index", body={"query": {"match_all": {}}}
        )

        async for ok, item in helpers.async_streaming_bulk(
            async_client, sync_gen(), index="test-index", refresh=True
        ):
            assert ok

        assert 100 == (await async_client.count(index="test-index"))["count"]
        assert {"answer": 42} == (await async_client.get(index="test-index", id=42))[
            "_source"
        ]
Example 2
async def async_streaming_bulk_types() -> None:
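    # Type-checking exercise: the actions argument may be an async generator,
    # an async iterator, a list, or a tuple.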
    async for _ in async_streaming_bulk(es, async_gen()):
        pass
    async for _ in async_streaming_bulk(es, async_gen().__aiter__()):
        pass
    async for _ in async_streaming_bulk(es, [{}]):
        pass
    async for _ in async_streaming_bulk(es, ({},)):
        pass
Example 3
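    # Confirms the helper does not mutate the caller's action dicts.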
    async def test_actions_remain_unchanged(self, async_client):
        actions = [{"_id": 1}, {"_id": 2}]
        async for ok, item in helpers.async_streaming_bulk(
            async_client, actions, index="test-index"
        ):
            assert ok
        assert [{"_id": 1}, {"_id": 2}] == actions
Example 4
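    # fail_at=(1, 2): the first two bulk calls raise 429. With chunk_size=1 and max_retries=1,
    # the first chunk fails again on its retry (False) while the other two chunks succeed,
    # for 4 bulk calls in total.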
    async def test_rejected_documents_are_retried_at_most_max_retries_times(
        self, async_client
    ):
        failing_client = FailingBulkClient(
            async_client, fail_at=(1, 2), fail_with=TransportError(429, "Rejected!", {})
        )

        docs = [
            {"_index": "i", "_id": 47, "f": "v"},
            {"_index": "i", "_id": 45, "f": "v"},
            {"_index": "i", "_id": 42, "f": "v"},
        ]
        results = [
            x
            async for x in helpers.async_streaming_bulk(
                failing_client,
                docs,
                raise_on_exception=False,
                raise_on_error=False,
                chunk_size=1,
                max_retries=1,
                initial_backoff=0,
            )
        ]
        assert 3 == len(results)
        assert [False, True, True] == [r[0] for r in results]
        await async_client.indices.refresh(index="i")
        res = await async_client.search(index="i")
        assert {"value": 2, "relation": "eq"} == res["hits"]["total"]
        assert 4 == failing_client._called
Example 5
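    # With raise_on_exception and raise_on_error disabled, the second bulk call's failure is
    # reported inline: the item for _id 45 carries the TransportError under the "exception" key.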
    async def test_transport_error_can_be_caught(self, async_client):
        failing_client = FailingBulkClient(async_client)
        docs = [
            {"_index": "i", "_id": 47, "f": "v"},
            {"_index": "i", "_id": 45, "f": "v"},
            {"_index": "i", "_id": 42, "f": "v"},
        ]

        results = [
            x
            async for x in helpers.async_streaming_bulk(
                failing_client,
                docs,
                raise_on_exception=False,
                raise_on_error=False,
                chunk_size=1,
            )
        ]
        assert 3 == len(results)
        assert [True, False, True] == [r[0] for r in results]

        exc = results[1][1]["index"].pop("exception")
        assert isinstance(exc, TransportError)
        assert 599 == exc.status_code
        assert {
            "index": {
                "_index": "i",
                "_id": 45,
                "data": {"f": "v"},
                "error": "TransportError(599, 'Error!')",
                "status": 599,
            }
        } == results[1][1]
Example 6
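    # Mixes the index (default), delete, and update op types in a single streaming bulk call.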
    async def test_different_op_types(self, async_client):
        await async_client.index(index="i", id=45, body={})
        await async_client.index(index="i", id=42, body={})
        docs = [
            {"_index": "i", "_id": 47, "f": "v"},
            {"_op_type": "delete", "_index": "i", "_id": 45},
            {"_op_type": "update", "_index": "i", "_id": 42, "doc": {"answer": 42}},
        ]
        async for ok, item in helpers.async_streaming_bulk(async_client, docs):
            assert ok

        assert not await async_client.exists(index="i", id=45)
        assert {"answer": 42} == (await async_client.get(index="i", id=42))["_source"]
        assert {"f": "v"} == (await async_client.get(index="i", id=47))["_source"]
Example 7
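    # Both documents violate the integer mapping on field "a", so with raise_on_error=True
    # the raised BulkIndexError reports every failed action from the chunk.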
    async def test_all_errors_from_chunk_are_raised_on_failure(self, async_client):
        await async_client.indices.create(
            "i",
            {
                "mappings": {"properties": {"a": {"type": "integer"}}},
                "settings": {"number_of_shards": 1, "number_of_replicas": 0},
            },
        )
        await async_client.cluster.health(wait_for_status="yellow")

        try:
            async for ok, item in helpers.async_streaming_bulk(
                async_client, [{"a": "b"}, {"a": "c"}], index="i", raise_on_error=True
            ):
                assert ok
        except helpers.BulkIndexError as e:
            assert 2 == len(e.errors)
        else:
            assert False, "exception should have been raised"
Example 8
    async def process(self, **kwargs):
        # Streams parsed documents into Elasticsearch, advancing a tqdm progress bar
        # by the number of bytes consumed from the underlying file.
        async for ok, result in async_streaming_bulk(
            self.es_client.es, self.get_next_doc()
        ):
            self.tqdm.update(self.parser.file_stream.fileobj.tell() - self.tqdm.n)
            self.tqdm_etree.update()
            action, result = result.popitem()
            if not ok:
                print("failed to %s document %s" % (action, result))
Example 9
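# Creates the "games" index if it does not exist, then streams documents
# from download_games_db() into it.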
async def ingest():
    if not (await es.indices.exists(index="games")):
        await es.indices.create(index="games")

    async for _ in async_streaming_bulk(client=es,
                                        index="games",
                                        actions=download_games_db()):
        pass

    return {"status": "ok"}
Example 10
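    # Streams claims into the index, collecting the _id of every successfully indexed item.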
    async def claim_consumer(self, claim_producer):
        touched = set()
        async for ok, item in async_streaming_bulk(
            self.sync_client,
            self._consume_claim_producer(claim_producer),
            raise_on_error=False,
        ):
            if not ok:
                self.logger.warning("indexing failed for an item: %s", item)
            else:
                item = item.popitem()[1]
                touched.add(item["_id"])
        await self.sync_client.indices.refresh(self.index)
        self.logger.info("Indexing done.")
Example 11
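    # The simplest case: a plain list of action dicts, indexed in one streaming bulk pass.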
    async def test_all_documents_get_inserted(self, async_client):
        docs = [{"answer": x, "_id": x} for x in range(100)]
        async for ok, item in helpers.async_streaming_bulk(
            async_client, docs, index="test-index", refresh=True
        ):
            assert ok

        assert 100 == (await async_client.count(index="test-index"))["count"]
        assert {"answer": 42} == (await async_client.get(index="test-index", id=42))[
            "_source"
        ]
Example 12
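# 429 responses are retried up to max_retries times even with raise_on_exception=True;
# other errors, or a 429 that persists past the last retry, are raised.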
async def streaming_bulk():
    results = [
        x
        async for x in helpers.async_streaming_bulk(
            failing_client,
            [{"a": 42}, {"a": 39}],
            raise_on_exception=True,
            max_retries=3,
            initial_backoff=0,
        )
    ]
    return results
Example 13
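# Full sync pipeline: (re)create the search index, choose a full or incremental claim
# generator, then stream every claim into Elasticsearch, logging items that fail to index.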
async def make_es_index_and_run_sync(env: Env,
                                     clients=32,
                                     force=False,
                                     db=None,
                                     index_name='claims'):
    index = SearchIndex(env.es_index_prefix,
                        elastic_host=env.elastic_host,
                        elastic_port=env.elastic_port)
    logging.info("ES sync host: %s:%i", env.elastic_host, env.elastic_port)
    try:
        created = await index.start()
    except IndexVersionMismatch as err:
        logging.info(
            "dropping ES search index (version %s) for upgrade to version %s",
            err.got_version, err.expected_version)
        await index.delete_index()
        await index.stop()
        created = await index.start()
    finally:
        await index.stop()

    es = AsyncElasticsearch([{
        'host': env.elastic_host,
        'port': env.elastic_port
    }])
    if force or created:
        claim_generator = get_all_claims(env, index_name=index_name, db=db)
    else:
        claim_generator = get_recent_claims(env, index_name=index_name, db=db)
    try:
        async for ok, item in async_streaming_bulk(es,
                                                   claim_generator,
                                                   request_timeout=600,
                                                   raise_on_error=False):
            if not ok:
                logging.warning("indexing failed for an item: %s", item)
        await es.indices.refresh(index=index_name)
    finally:
        await es.close()
Example 14
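    # Emits one _op_type="update" action per claim, each running the Painless trending
    # script with that claim's height and amount changes as script params.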
    async def update_trending_score(self, params):
        update_trending_score_script = """
        double softenLBC(double lbc) { return (Math.pow(lbc, 1.0 / 3.0)); }

        double logsumexp(double x, double y)
        {
            double top;
            if(x > y)
                top = x;
            else
                top = y;
            double result = top + Math.log(Math.exp(x-top) + Math.exp(y-top));
            return(result);
        }

        double logdiffexp(double big, double small)
        {
            return big + Math.log(1.0 - Math.exp(small - big));
        }

        double squash(double x)
        {
            if(x < 0.0)
                return -Math.log(1.0 - x);
            else
                return Math.log(x + 1.0);
        }

        double unsquash(double x)
        {
            if(x < 0.0)
                return 1.0 - Math.exp(-x);
            else
                return Math.exp(x) - 1.0;
        }

        double log_to_squash(double x)
        {
            return logsumexp(x, 0.0);
        }

        double squash_to_log(double x)
        {
            //assert x > 0.0;
            return logdiffexp(x, 0.0);
        }

        double squashed_add(double x, double y)
        {
            // squash(unsquash(x) + unsquash(y)) but avoiding overflow.
            // Cases where the signs are the same
            if (x < 0.0 && y < 0.0)
                return -logsumexp(-x, logdiffexp(-y, 0.0));
            if (x >= 0.0 && y >= 0.0)
                return logsumexp(x, logdiffexp(y, 0.0));
            // Where the signs differ
            if (x >= 0.0 && y < 0.0)
                if (Math.abs(x) >= Math.abs(y))
                    return logsumexp(0.0, logdiffexp(x, -y));
                else
                    return -logsumexp(0.0, logdiffexp(-y, x));
            if (x < 0.0 && y >= 0.0)
            {
                // Addition is commutative, hooray for new math
                return squashed_add(y, x);
            }
            return 0.0;
        }

        double squashed_multiply(double x, double y)
        {
            // squash(unsquash(x)*unsquash(y)) but avoiding overflow.
            int sign;
            if(x*y >= 0.0)
                sign = 1;
            else
                sign = -1;
            return sign*logsumexp(squash_to_log(Math.abs(x))
                            + squash_to_log(Math.abs(y)), 0.0);
        }

        // Squashed inflated units
        double inflateUnits(int height) {
            double timescale = 576.0; // Half life of 400 = e-folding time of a day
                                      // by coincidence, so may as well go with it
            return log_to_squash(height / timescale);
        }

        double spikePower(double newAmount) {
            if (newAmount < 50.0) {
                return(0.5);
            } else if (newAmount < 85.0) {
                return(newAmount / 100.0);
            } else {
                return(0.85);
            }
        }

        double spikeMass(double oldAmount, double newAmount) {
            double softenedChange = softenLBC(Math.abs(newAmount - oldAmount));
            double changeInSoftened = Math.abs(softenLBC(newAmount) - softenLBC(oldAmount));
            double power = spikePower(newAmount);
            if (oldAmount > newAmount) {
                return -1.0 * Math.pow(changeInSoftened, power) * Math.pow(softenedChange, 1.0 - power);
            } else {
                return Math.pow(changeInSoftened, power) * Math.pow(softenedChange, 1.0 - power);
            }
        }
        for (i in params.src.changes) {
            double units = inflateUnits(i.height);
            if (ctx._source.trending_score == null) {
                ctx._source.trending_score = 0.0;
            }
            double bigSpike = squashed_multiply(units, squash(spikeMass(i.prev_amount, i.new_amount)));
            ctx._source.trending_score = squashed_add(ctx._source.trending_score, bigSpike);
        }
        """
        start = time.perf_counter()

        def producer():
            for claim_id, claim_updates in params.items():
                yield {
                    '_id': claim_id,
                    '_index': self.index,
                    '_op_type': 'update',
                    'script': {
                        'lang': 'painless',
                        'source': update_trending_score_script,
                        'params': {
                            'src': {
                                'changes': [{
                                    'height': p.height,
                                    'prev_amount': p.prev_amount / 1E8,
                                    'new_amount': p.new_amount / 1E8,
                                } for p in claim_updates]
                            }
                        }
                    },
                }

        if not params:
            return
        async for ok, item in async_streaming_bulk(self.sync_client,
                                                   producer(),
                                                   raise_on_error=False):
            if not ok:
                self.logger.warning("updating trending failed for an item: %s",
                                    item)
        await self.sync_client.indices.refresh(self.index)
        self.logger.info("updated trending scores in %ims",
                         int((time.perf_counter() - start) * 1000))