Example #1
async def test_scan_equal_chunks_for_loop(es, es_clean, populate):
    for n, scroll_size in [
        (0, 1),  # no results
        (6, 6),  # 1 scroll
        (6, 8),  # 1 scroll
        (6, 3),  # 2 scrolls
        (6, 4),  # 2 scrolls
        (6, 2),  # 3 scrolls
        (6, 1),  # 6 scrolls
    ]:
        es_clean()

        index = 'test_aioes'
        doc_type = 'type_1'
        body = {'foo': 1}

        await populate(index, doc_type, n, body)

        ids = set()

        async with Scan(
            es,
            index=index,
            doc_type=doc_type,
            size=scroll_size,
        ) as scan:

            async for doc in scan:
                ids.add(doc['_id'])

            # check number of unique doc ids
            assert len(ids) == n == scan.total
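The scroll counts in the comments above follow directly from the page size: each scroll request returns at most size hits, so n documents take ceil(n / scroll_size) requests. A quick sketch of that arithmetic for the pairs used in the test:

import math

# Expected number of scroll pages for each (n, scroll_size) pair above.
for n, scroll_size in [(0, 1), (6, 6), (6, 8), (6, 3), (6, 4), (6, 2), (6, 1)]:
    print(n, scroll_size, math.ceil(n / scroll_size))
# prints 0, 1, 1, 2, 2, 3 and 6 pages, matching the inline comments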
Example #2
async def test_scan_exception_on_failed_shards(es, populate, mocker):
    index = 'test_aioes'
    doc_type = 'type_2'
    scroll_size = 3
    n = 10

    body = {'foo': 1}
    await populate(index, doc_type, n, body)

    mocker.spy(logger, 'warning')

    i = 0
    async with Scan(
        es,
        index=index,
        doc_type=doc_type,
        size=scroll_size,
    ) as scan:
        with pytest.raises(ScanError) as cm:
            async for doc in scan:  # noqa
                if i == 3:
                    # once after first scroll
                    scan._failed_shards = 1
                    scan._total_shards = 2
                i += 1

        assert (str(cm.value) ==
                'Scroll request has failed on 1 shards out of 5.')

    assert i == 6
    logger.warning.assert_called_once_with(
        'Scroll request has failed on %d shards out of %d.', 1, 5)
Example #3
async def test_scan_simple(es, populate):
    index = 'test_aioes'
    doc_type = 'type_2'
    scroll_size = 3
    n = 10

    body = {'foo': 1}
    await populate(index, doc_type, n, body)
    ids = set()

    async with Scan(
        es,
        index=index,
        doc_type=doc_type,
        size=scroll_size,
    ) as scan:
        assert isinstance(scan.scroll_id, str)
        assert scan.total == 10
        async for doc in scan:
            ids.add(doc['_id'])
            assert doc == {'_id': mock.ANY,
                           '_index': 'test_aioes',
                           '_score': None,
                           '_source': {'foo': 1},
                           '_type': 'type_2',
                           'sort': mock.ANY}

    assert ids == {str(i) for i in range(10)}
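Outside of a test the same pattern takes only a few lines: open the client and the scan as async context managers, then iterate the hits. A minimal sketch, assuming the aioelasticsearch package (where Scan lives in aioelasticsearch.helpers) and a local node; the index and document type names are placeholders.

import asyncio

from aioelasticsearch import Elasticsearch
from aioelasticsearch.helpers import Scan


async def dump_ids():
    # The scan issues the initial search when the context manager is
    # entered and pages through the scroll API as it is iterated.
    async with Elasticsearch() as es:
        async with Scan(es, index='test_aioes', doc_type='type_2', size=100) as scan:
            print('total hits:', scan.total)
            async for doc in scan:
                print(doc['_id'], doc['_source'])


asyncio.run(dump_ids())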
Example #4
async def search(request):
    es = Elasticsearch()

    q = request.query.get('q')
    try:
        limit = int(request.query.get('limit', 0))
        offset = int(request.query.get('offset', 0))
    except ValueError:
        return json_response({'response': 'wrong query'})

    body = {}
    if q:
        body['query'] = {'match': {'text': q}}

    async with Scan(
            es,
            index=index_name,
            doc_type='crawler',
            query=body,
    ) as scan_res:
        res_source, count = await format_search(scan_res, limit, offset)
        text = {
            'total_hits': count,
            'count': len(res_source),
            'results': res_source
        }
        return json_response(text)
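format_search is not shown in this example. A plausible sketch is below, inferred from the call site: it drains the scan, applies offset and limit, and returns the selected page together with the total hit count. Only the name and signature come from the code above; the body is an assumption.

async def format_search(scan_res, limit, offset):
    # Hypothetical helper: materialize all hits from the scan, then
    # slice the requested page in memory.
    hits = [{'id': doc['_id'], **doc['_source']} async for doc in scan_res]
    count = len(hits)
    page = hits[offset:offset + limit] if limit else hits[offset:]
    return page, count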
Example #5
async def test_scan_warning_on_failed_shards(es, populate, mocker):
    index = 'test_aioes'
    doc_type = 'type_2'
    scroll_size = 3
    n = 10

    body = {'foo': 1}
    await populate(index, doc_type, n, body)

    mocker.spy(logger, 'warning')

    async with Scan(
        es,
        index=index,
        doc_type=doc_type,
        size=scroll_size,
        raise_on_error=False,
    ) as scan:
        i = 0
        async for doc in scan:  # noqa
            if i == 3:
                # once after first scroll
                scan._failed_shards = 1
                scan._total_shards = 2
            i += 1

    logger.warning.assert_called_once_with(
        'Scroll request has failed on %d shards out of %d.', 1, 5)
Example #6
    async def es_range(self, index, tp, *keys, call=None, **query):
        async with Elasticsearch([self.host]) as es:
            async with Scan(
                    es,
                    index=index,
                    doc_type=tp,
                    query=query,
            ) as scan:

                res = []
                count = await es.count(index=index)
                count = count['count']
                progressbar = tqdm(desc="scan all elasticsearch", total=count)
                ic = 0
                async for doc in scan:
                    ic += 1
                    if ic % 1000 == 0:
                        # advance the bar by the 1000 documents processed
                        # since the previous update
                        progressbar.update(1000)
                    if call:
                        call(doc)
                    else:
                        # each key is a colon-separated path into the hit,
                        # e.g. '_source:foo' walks doc['_source']['foo']
                        dd = {}
                        for k in keys:
                            km = k.split(':')
                            v = doc
                            for kk in km:
                                v = v.get(kk)
                                if not v:
                                    break
                            dd[k] = v
                        res.append(dd)
                progressbar.close()
                return res
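The keys arguments are colon-separated paths into each hit, so nested fields such as _source.foo can be collected without passing a call callback. A hypothetical usage, where indexer stands for an instance of the class defining es_range and the index, type and field names are placeholders:

# Collect the document id and the nested _source.foo value for every hit.
rows = await indexer.es_range('test_aioes', 'type_1', '_id', '_source:foo')
# -> [{'_id': '0', '_source:foo': 1}, {'_id': '1', '_source:foo': 1}, ...]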
Example #7
async def search(request):
    logger.info(request.query)
    try:
        schema = SearchViewSchema()
        r = schema.load({**request.query})
        q, limit, offset = r['q'], r.get('limit', 100), r.get('offset', 0)
    except Exception as e:
        r = {'status': 'bad_request', 'reason': str(e)}
        logger.error(r)
        return await json_response(r)

    body = {'query': {'match': {'text': q}}}

    all_documents = [
        '{}://{}'.format('https' if i.https else 'http', i.domain)
        async for i in await CrawlerStats.objects.all()
    ]
    index_names_docs = [
        ''.join([
            i for i in ii if i not in ('[', '"', '*', '\\', '<', '|', ',',
                                       '>', '/', '?', ':')
        ]) for ii in all_documents
    ]

    response_data = {
        'total_hits': 0,
        'count': 0,
        'documents_in_list': [],
        'results': []
    }

    for index_name in index_names_docs:
        async with Scan(
                es,
                index=index_name,
                doc_type='crawler',
                query=body,
        ) as scan_res:
            res_source = [{
                'id': doc['_id'],
                **doc['_source']
            } async for doc in scan_res]
            response_data['total_hits'] += len(res_source)
            response_data['results'].extend(res_source)

    count = len(response_data['results'])
    if limit:
        response_data['results'] = response_data['results'][
            offset:min(limit + offset, count)]
    else:
        response_data['results'] = response_data['results'][offset:]

    response_data['documents_in_list'] = list(
        set([await get_domain(i['url']) for i in response_data['results']]))
    response_data['count'] = len(response_data['results'])

    r = {'status': 'ok', 'data': response_data}
    logger.info(r)
    return await json_response(r)
Example #8
async def test_scan_no_mask_index(es):
    index = 'undefined-*'
    scroll_size = 3

    async with Scan(
        es,
        index=index,
        size=scroll_size,
    ) as scan:
        assert scan.scroll_id is None
        assert scan.total['value'] == 0
        cnt = 0
        async for doc in scan:  # noqa
            cnt += 1
        assert cnt == 0
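Note that this test reads scan.total['value'] while the next one compares scan.total to a plain integer: Elasticsearch 7.x reports hits.total as an object with value and relation keys, whereas earlier versions report a bare number, and the scan exposes whatever the server returned.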
Example #9
async def test_scan_no_index(es):
    index = 'undefined'
    doc_type = 'any'
    scroll_size = 3

    async with Scan(
        es,
        index=index,
        doc_type=doc_type,
        size=scroll_size,
    ) as scan:
        assert scan.scroll_id is None
        assert scan.total == 0
        cnt = 0
        async for doc in scan:  # noqa
            cnt += 1
        assert cnt == 0
Example #10
async def test_scan_no_scroll(es, loop, populate):
    index = 'test_aioes'
    n = 10
    scroll_size = 1
    body = {'foo': 1}

    await populate(index, n, body)

    async with Scan(
        es,
        size=scroll_size,
    ) as scan:
        # the same error occurs after the search context has expired
        await scan._do_clear_scroll()

        with pytest.raises(NotFoundError):
            async for doc in scan:
                doc
Example #11
def test_scan_scroll_id_without_context_manager(es):
    scan = Scan(es)

    with pytest.raises(RuntimeError):
        scan.scroll_id
Example #12
async def test_scan_async_for_without_context_manager(es):
    scan = Scan(es)

    with pytest.raises(RuntimeError):
        async for doc in scan:
            doc
Example #13
def test_scan_total_without_context_manager(es):
    scan = Scan(es)

    with pytest.raises(RuntimeError):
        scan.total
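The last three tests make the same point from different angles: the scroll state only exists once the scan has been entered, so scroll_id, total and iteration all raise RuntimeError outside the async with block. A minimal sketch of the distinction, assuming the same aioelasticsearch import as in the earlier sketch and an already opened client es:

from aioelasticsearch.helpers import Scan


async def peek(es):
    scan = Scan(es, index='test_aioes', size=3)

    try:
        scan.scroll_id  # no search has been issued yet, so this raises
    except RuntimeError:
        pass

    async with scan:
        # Entering the context manager runs the initial search, after
        # which scroll_id, total and async iteration are all available.
        print(scan.scroll_id, scan.total)
        async for doc in scan:
            print(doc['_id'])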