Example #1
def test_events_indexer_id_windowing(app, mock_event_queue):
    """Check that EventsIndexer applies time windows to ids."""

    indexer = EventsIndexer(mock_event_queue,
                            preprocessors=[],
                            double_click_window=180)

    # Generated docs will be registered in this list
    received_docs = []

    def bulk(client, generator, *args, **kwargs):
        received_docs.extend(generator)

    mock_event_queue.consume.return_value = [
        _create_file_download_event(date) for date in [
            # These two events will fall in the same window
            (2017, 6, 1, 0, 11, 3),
            (2017, 6, 1, 0, 9, 1),
            # These two events will fall in the same window
            (2017, 6, 2, 0, 12, 10),
            (2017, 6, 2, 0, 13, 3),
            (2017, 6, 2, 0, 30, 3)
        ]
    ]

    with patch('elasticsearch.helpers.bulk', side_effect=bulk):
        indexer.run()

    assert len(received_docs) == 5
    ids = set(doc['_id'] for doc in received_docs)
    assert len(ids) == 3
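
All of these snippets rely on a _create_file_download_event helper defined in the test suite's conftest. Its body is not shown here; what follows is a plausible sketch, inferred from the event fields used in Examples #6 and #9 (the default ids and values are assumptions):

import datetime

def _create_file_download_event(timestamp,
                                bucket_id='B0000000000000000000000000000001',
                                file_id='F0000000000000000000000000000001',
                                file_key='test.pdf'):
    """Build a sample file-download event (a sketch; defaults are assumptions)."""
    return dict(
        # The date tuples in the tests expand to a datetime, e.g.
        # (2017, 6, 1, 0, 11, 3) becomes '2017-06-01T00:11:03'.
        timestamp=datetime.datetime(*timestamp).isoformat(),
        bucket_id=bucket_id,
        file_id=file_id,
        file_key=file_key,
        size=9000,
        visitor_id=100,
    )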
Example #2
def test_index_prefix(config_with_index_prefix, app, event_queues,
                      es_with_templates):
    es = es_with_templates

    # 1) publish events in the queue
    current_stats.publish('file-download', [
        _create_file_download_event(date)
        for date in [(2018, 1, 1), (2018, 1, 2), (2018, 1, 3), (2018, 1, 4)]
    ])

    queue = current_queues.queues['stats-file-download']
    assert get_queue_size('stats-file-download') == 4

    # 2) preprocess events
    indexer = EventsIndexer(queue, preprocessors=[flag_machines, flag_robots])
    indexer.run()
    es.indices.refresh(index='*')

    assert get_queue_size('stats-file-download') == 0

    index_prefix = config_with_index_prefix['SEARCH_INDEX_PREFIX']
    index_name = index_prefix + 'events-stats-file-download'

    assert es.indices.exists(index_name + '-2018-01-01')
    assert es.indices.exists(index_name + '-2018-01-02')
    assert es.indices.exists(index_name + '-2018-01-03')
    assert es.indices.exists(index_name + '-2018-01-04')
    assert es.indices.exists_alias(name=index_name)

    # 3) aggregate events
    aggregate_events(['file-download-agg'])
    es.indices.refresh(index='*')
    assert es.indices.exists(index_prefix + 'stats-file-download-2018-01')

    # 4) queries
    query_configs = register_queries()
    histo_query = ESDateHistogramQuery(query_name='test_histo',
                                       **query_configs[0]['query_config'])
    results = histo_query.run(bucket_id='B0000000000000000000000000000001',
                              file_key='test.pdf',
                              start_date=datetime.datetime(2018, 1, 1),
                              end_date=datetime.datetime(2018, 1, 3))
    assert len(results['buckets'])
    for day_result in results['buckets']:
        assert int(day_result['value']) == 1

    terms_query = ESTermsQuery(query_name='test_total_count',
                               **query_configs[1]['query_config'])
    results = terms_query.run(bucket_id='B0000000000000000000000000000001',
                              start_date=datetime.datetime(2018, 1, 1),
                              end_date=datetime.datetime(2018, 1, 7))
    assert int(results['buckets'][0]['value']) == 4
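
register_queries is defined elsewhere in the test suite. Below is a plausible shape for the two configurations it returns, matching the run() arguments used above (all names and fields are assumptions based on invenio-stats conventions):

def register_queries():
    """Sample query configurations (a sketch; field names are assumptions)."""
    return [
        dict(
            query_name='bucket-file-download-histogram',
            query_class=ESDateHistogramQuery,
            query_config=dict(
                index='stats-file-download',
                doc_type='file-download-day-aggregation',
                copy_fields=dict(bucket_id='bucket_id', file_key='file_key'),
                required_filters=dict(bucket_id='bucket_id',
                                      file_key='file_key'),
            )
        ),
        dict(
            query_name='bucket-file-download-total',
            query_class=ESTermsQuery,
            query_config=dict(
                index='stats-file-download',
                doc_type='file-download-day-aggregation',
                copy_fields=dict(),
                required_filters=dict(bucket_id='bucket_id'),
                aggregated_fields=['file_key'],
            )
        ),
    ]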
Example #3
def test_bookmark_removal(app, es_with_templates, mock_event_queue):
    """Remove aggregation bookmark and restart aggregation.

    This simulates the scenario where aggregations have been created but
    the bookmarks have not been set due to an error.
    """
    mock_event_queue.consume.return_value = [
        _create_file_download_event(date) for date in [
            (2017, 6, 2, 15),  # second event on the same date
            (2017, 7, 1)
        ]
    ]
    indexer = EventsIndexer(mock_event_queue)
    indexer.run()
    current_search_client.indices.refresh(index='*')

    def aggregate_and_check_version(expected_version):
        # Aggregate events
        StatAggregator(name='file-download-agg',
                       event='file-download',
                       aggregation_field='file_id',
                       aggregation_interval='day',
                       query_modifiers=[]).run()
        current_search_client.indices.refresh(index='*')
        res = current_search_client.search(
            index='stats-file-download',
            doc_type='file-download-day-aggregation',
            version=True)
        for hit in res['hits']['hits']:
            assert hit['_version'] == expected_version

    aggregate_and_check_version(1)
    aggregate_and_check_version(1)
    # Delete all bookmarks
    bookmarks = Search(
        using=current_search_client,
        index='stats-file-download',
        doc_type='file-download-agg-bookmark').query('match_all')
    for bookmark in bookmarks:
        current_search_client.delete(
            index=bookmark.meta.index,
            id=bookmark.meta.id,
            doc_type='file-download-agg-bookmark')
    current_search_client.indices.refresh(index='*')
    # the aggregations should have been overwritten
    aggregate_and_check_version(2)
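
The mock_event_queue fixture comes from the same conftest. A minimal sketch following the Mock-based pattern that Example #6 spells out explicitly (the queued_events attribute is an assumption based on its use in Examples #7 and #8):

import pytest
from unittest.mock import Mock

@pytest.fixture()
def mock_event_queue():
    """A stand-in queue; each test sets consume.return_value itself."""
    queue = Mock()
    queue.routing_key = 'stats-file-download'
    queue.queued_events = []
    return queue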
Example #4
def test_failing_processors(app, es, event_queues, caplog):
    """Test events that raise an exception when processed."""
    search = Search(using=es)

    current_queues.declare()
    current_stats.publish('file-download', [
        _create_file_download_event(date)
        for date in [(2018, 1, 1), (2018, 1, 2), (2018, 1, 3), (2018, 1, 4)]
    ])

    def _raises_on_second_call(doc):
        if _raises_on_second_call.calls == 1:
            _raises_on_second_call.calls += 1
            raise Exception('mocked-exception')
        _raises_on_second_call.calls += 1
        return doc

    _raises_on_second_call.calls = 0

    queue = current_queues.queues['stats-file-download']
    indexer = EventsIndexer(queue, preprocessors=[_raises_on_second_call])

    current_search.flush_and_refresh(index='*')
    assert get_queue_size('stats-file-download') == 4
    assert not es.indices.exists('events-stats-file-download-2018-01-01')
    assert not es.indices.exists('events-stats-file-download-2018-01-02')
    assert not es.indices.exists('events-stats-file-download-2018-01-03')
    assert not es.indices.exists('events-stats-file-download-2018-01-04')
    assert not es.indices.exists_alias(name='events-stats-file-download')

    with caplog.at_level(logging.ERROR):
        indexer.run()  # 2nd event raises exception and is dropped

    # Check that the error was logged
    error_logs = [r for r in caplog.records if r.levelno == logging.ERROR]
    assert len(error_logs) == 1
    assert error_logs[0].msg == 'Error while processing event'
    assert error_logs[0].exc_info[1].args[0] == 'mocked-exception'

    current_search.flush_and_refresh(index='*')
    assert get_queue_size('stats-file-download') == 0
    assert search.index('events-stats-file-download').count() == 3
    assert search.index('events-stats-file-download-2018-01-01').count() == 1
    assert not es.indices.exists('events-stats-file-download-2018-01-02')
    assert search.index('events-stats-file-download-2018-01-03').count() == 1
    assert search.index('events-stats-file-download-2018-01-04').count() == 1
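
get_queue_size is another conftest helper. One plausible implementation, assuming the queues are backed by kombu, whose queue_declare(passive=True) reports the message count without consuming anything:

def get_queue_size(queue_name):
    """Return the number of messages waiting in a queue (a sketch)."""
    queue = current_queues.queues[queue_name]
    _, message_count, _ = queue.queue.queue_declare(passive=True)
    return message_count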
Example #5
def test_bookmark_removal(app, es, mock_event_queue):
    """Remove aggregation bookmark and restart aggregation.

    This simulates the scenario where aggregations have been created but
    the bookmarks have not been set due to an error.
    """
    mock_event_queue.consume.return_value = [
        _create_file_download_event(date) for date in
        [(2017, 6, 2, 15),  # second event on the same date
         (2017, 7, 1)]
    ]
    indexer = EventsIndexer(mock_event_queue)
    indexer.run()
    current_search.flush_and_refresh(index='*')

    def aggregate_and_check_version(expected_version):
        StatAggregator(
            field='file_id',
            interval='day',
            name='file-download-agg',
            event='file-download',
            query_modifiers=[],
        ).run()
        current_search.flush_and_refresh(index='*')
        res = es.search(
            index='stats-file-download', version=True)
        for hit in res['hits']['hits']:
            assert hit['_version'] == expected_version

    aggregate_and_check_version(1)
    aggregate_and_check_version(1)
    # Delete all bookmarks
    bookmarks = Search(using=es, index='stats-bookmarks') \
        .filter('term', aggregation_type='file-download-agg') \
        .execute()

    for bookmark in bookmarks:
        es.delete(
            index=bookmark.meta.index, id=bookmark.meta.id,
            doc_type=get_doctype(bookmark.meta.doc_type)
        )

    current_search.flush_and_refresh(index='*')
    # the aggregations should have been overwritten
    aggregate_and_check_version(2)
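
get_doctype papers over the removal of mapping types in Elasticsearch 7; a plausible implementation is shown below (an assumption, not necessarily the library's verbatim code):

from elasticsearch import VERSION as ES_VERSION

def get_doctype(doc_type):
    """Return the given doc_type on ES < 7, '_doc' otherwise (a sketch)."""
    return doc_type if ES_VERSION[0] < 7 else '_doc'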
Example #6
def generate_events(app, file_number=5, event_number=100, robot_event_number=0,
                    start_date=datetime.date(2017, 1, 1),
                    end_date=datetime.date(2017, 1, 7)):
    """Queued events for processing tests."""
    current_queues.declare()

    for t in current_search.put_templates(ignore=[400]):
        pass

    def _unique_ts_gen():
        ts = 0
        while True:
            ts += 1
            yield ts

    def generator_list():
        unique_ts = _unique_ts_gen()
        for file_idx in range(file_number):
            for entry_date in date_range(start_date, end_date):
                file_id = 'F000000000000000000000000000000{}'.\
                    format(file_idx + 1)
                bucket_id = 'B000000000000000000000000000000{}'.\
                    format(file_idx + 1)

                def build_event(is_robot=False):
                    ts = next(unique_ts)
                    return dict(
                        timestamp=datetime.datetime.combine(
                            entry_date,
                            datetime.time(minute=ts % 60,
                                          second=ts % 60)).
                        isoformat(),
                        bucket_id=bucket_id,
                        file_id=file_id,
                        file_key='test.pdf',
                        size=9000,
                        visitor_id=100,
                        is_robot=is_robot
                    )

                for event_idx in range(event_number):
                    yield build_event()
                for event_idx in range(robot_event_number):
                    yield build_event(True)

    mock_queue = Mock()
    mock_queue.consume.return_value = generator_list()
    mock_queue.routing_key = 'stats-file-download'

    EventsIndexer(
        mock_queue,
        preprocessors=[
            build_file_unique_id
        ],
        double_click_window=0
    ).run()
    current_search_client.indices.refresh(index='*')
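
A test would then seed its data with a single call, e.g.:

# 2 files, daily events from 2017-01-01 to 2017-01-07,
# 10 regular plus 5 robot events per file and day:
generate_events(app, file_number=2, event_number=10, robot_event_number=5)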
Example #7
def test_events_indexer_preprocessors(app, mock_event_queue):
    """Check that EventsIndexer calls properly the preprocessors."""
    def test_preprocessor1(event):
        event['test1'] = 42
        event['visitor_id'] = 'testuser1'
        return event

    def test_preprocessor2(event):
        event['test2'] = 21
        return event

    indexer = EventsIndexer(mock_event_queue,
                            preprocessors=[
                                build_file_unique_id, test_preprocessor1,
                                test_preprocessor2
                            ])

    # Generate the events
    received_docs = []

    def bulk(client, generator, *args, **kwargs):
        received_docs.extend(generator)

    with patch('elasticsearch.helpers.bulk', side_effect=bulk):
        indexer.run()

    # Build the documents we expect the indexer to produce
    expected_docs = []
    for event in mock_event_queue.queued_events:
        event = build_file_unique_id(event)
        event = test_preprocessor1(event)
        event = test_preprocessor2(event)
        _id = hash_id('2017-01-01T00:00:00', event)
        expected_docs.append(
            dict(
                _id=_id,
                _op_type='index',
                _index='events-stats-file-download-2017-01-01',
                _type=get_doctype('stats-file-download'),
                _source=event,
            ))

    assert received_docs == expected_docs
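
Two invenio-stats helpers appear here: build_file_unique_id (also used as a preprocessor in Examples #6 and #9) and hash_id. Plausible sketches of both, close to but not guaranteed to match the library's actual code:

import hashlib

def build_file_unique_id(doc):
    """Derive a unique_id from the bucket and file ids (a sketch)."""
    doc['unique_id'] = '{0}_{1}'.format(doc['bucket_id'], doc['file_id'])
    return doc

def hash_id(iso_timestamp, event):
    """Build a deterministic document id from the (windowed) timestamp
    and the event's identifying fields (a sketch)."""
    return '{0}-{1}'.format(
        iso_timestamp,
        hashlib.sha1(
            event.get('unique_id').encode('utf-8') +
            str(event.get('visitor_id', '')).encode('utf-8')
        ).hexdigest())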
Example #8
def test_events_indexer_preprocessors(app, mock_event_queue):
    """Check that EventsIndexer calls properly the preprocessors."""
    def test_preprocessor1(event):
        event['test1'] = 42
        return event

    def test_preprocessor2(event):
        event['test2'] = 21
        return event

    indexer = EventsIndexer(
        mock_event_queue,
        preprocessors=[test_preprocessor1, test_preprocessor2]
    )

    # Generate the events
    received_docs = []

    def bulk(client, generator, *args, **kwargs):
        received_docs.extend(generator)

    with patch('elasticsearch.helpers.bulk', side_effect=bulk):
        indexer.run()

    # Build the documents we expect the indexer to produce
    expected_docs = []
    for event in mock_event_queue.queued_events:
        event = test_preprocessor1(event)
        event = test_preprocessor2(event)
        expected_docs.append(dict(
            _op_type='index',
            _index='events-stats-file-download-2017-01-01',
            _type='stats-file-download',
            _source=event,
        ))

    assert received_docs == expected_docs
Example #9
def generate_events(app,
                    file_number=5,
                    event_number=100,
                    robot_event_number=0,
                    start_date=datetime.date(2017, 1, 1),
                    end_date=datetime.date(2017, 1, 7)):
    """Queued events for processing tests."""
    current_queues.declare()

    for t in current_search.put_templates(ignore=[400]):
        pass

    def generator_list():
        for file_idx in range(file_number):
            for entry_date in date_range(start_date, end_date):
                entry_date = datetime.datetime.combine(entry_date,
                                                       datetime.time())
                file_id = '{0}-{1}'.format(entry_date.strftime('%Y-%m-%d'),
                                           file_idx)

                def build_event(is_robot=False):
                    return dict(timestamp=entry_date.isoformat(),
                                bucket_id=file_id,
                                file_id=file_id,
                                file_key='test.pdf',
                                visitor_id=100,
                                is_robot=is_robot)

                for event_idx in range(event_number):
                    yield build_event()
                for event_idx in range(robot_event_number):
                    yield build_event(True)

    mock_queue = Mock()
    mock_queue.consume.return_value = generator_list()
    mock_queue.routing_key = 'stats-file-download'

    EventsIndexer(mock_queue, preprocessors=[build_file_unique_id]).run()
    current_search_client.indices.flush(index='*')
Example #10
def test_overwriting_aggregations(app, mock_event_queue, es_with_templates):
    """Check that the StatAggregator correctly starts from bookmark.

    1. Create a sample file download event and process it.
    2. Run the aggregator and write the count to the aggregation index.
    3. Create new events and repeat the procedure, asserting that events
       within the interval of the previous run overwrite the existing
       aggregation, verified by an increased document version.
    """
    class NewDate(datetime.datetime):
        """datetime.datetime mock."""
        # Aggregate at 12:00, thus the day will be aggregated again later
        current_date = (2017, 6, 2, 12)

        @classmethod
        def utcnow(cls):
            return cls(*cls.current_date)

    # Send some events
    mock_event_queue.consume.return_value = [
        _create_file_download_event(date)
        for date in [(2017, 6, 1), (2017, 6, 2, 10)]
    ]

    indexer = EventsIndexer(mock_event_queue)
    indexer.run()
    current_search_client.indices.refresh(index='*')

    # Aggregate events
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])
    current_search_client.indices.refresh(index='*')

    # Check that the initial aggregations were written with version 1
    res = current_search_client.search(index='stats-file-download',
                                       version=True)
    for hit in res['hits']['hits']:
        if 'file_id' in hit['_source'].keys():
            assert hit['_version'] == 1

    # Send new events, some on the last aggregated day and some far
    # in the future.
    mock_event_queue.consume.return_value = [
        _create_file_download_event(date) for date in [
            (2017, 6, 2, 15),  # second event on the same date
            (2017, 7, 1)
        ]
    ]
    indexer = EventsIndexer(mock_event_queue)
    indexer.run()
    current_search_client.indices.refresh(index='*')

    # Aggregate again. The aggregation should start from the last bookmark.
    NewDate.current_date = (2017, 7, 2)
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])
    current_search_client.indices.refresh(index='*')

    res = current_search_client.search(
        index='stats-file-download',
        doc_type='file-download-day-aggregation',
        version=True)
    for hit in res['hits']['hits']:
        if hit['_source']['timestamp'] == '2017-06-02T00:00:00':
            assert hit['_version'] == 2
            assert hit['_source']['count'] == 2
        else:
            assert hit['_version'] == 1
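
The NewDate trick works because the aggregator presumably calls datetime.datetime.utcnow() to decide the upper bound of the aggregation window, so replacing the class pins "now" to a chosen instant. A self-contained illustration of the same patching pattern:

import datetime
from unittest.mock import patch

class FixedDate(datetime.datetime):
    """datetime.datetime replacement with a controllable utcnow()."""
    current_date = (2017, 6, 2, 12)

    @classmethod
    def utcnow(cls):
        return cls(*cls.current_date)

with patch('datetime.datetime', FixedDate):
    # Code that looks up datetime.datetime through the module now sees
    # the fixed time.
    assert datetime.datetime.utcnow() == datetime.datetime(2017, 6, 2, 12)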